* Import all the necessary libraries.
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn.metrics as metrics
from scipy import stats
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn import model_selection
from sklearn.tree import export_graphviz
from IPython.display import Image
from sklearn.tree import plot_tree
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")
# READ THE CARNAME.CSV FILE DATA INTO CAR_NAME DATAFRAME
CAR_NAME = pd.read_csv("Car name.csv")
# PRINT THE FIRST 5 RECORDS OF THE DATA FRAME
print(CAR_NAME.head())
car_name 0 chevrolet chevelle malibu 1 buick skylark 320 2 plymouth satellite 3 amc rebel sst 4 ford torino
# PRINT THE LAST 5 RECORDS OF THE DATA FRAME
print(CAR_NAME.tail())
car_name 393 ford mustang gl 394 vw pickup 395 dodge rampage 396 ford ranger 397 chevy s-10
THERE ARE TOTAL OF 398 RECORDS IN THE CSV FILE WHICH ARE CAR NAMES AND THE DATA IS LOADED INTO CAR_NAME DATASET
# READ THE CAR-ATTRIBUTES.JSON FILE DATA INTO DATAFRAME NAMED CAR_DATA
CAR_DATA = pd.read_json("Car-Attributes.json")
# PRINT THE FIRST FIVE RECORDS OF THE CAR_DATA
print(CAR_DATA.head())
mpg cyl disp hp wt acc yr origin 0 18.0 8 307.0 130 3504 12.0 70 1 1 15.0 8 350.0 165 3693 11.5 70 1 2 18.0 8 318.0 150 3436 11.0 70 1 3 16.0 8 304.0 150 3433 12.0 70 1 4 17.0 8 302.0 140 3449 10.5 70 1
# PRINT THE LAST FIVE RECORDS OF THE CAR_DATA
print(CAR_DATA.tail())
mpg cyl disp hp wt acc yr origin 393 27.0 4 140.0 86 2790 15.6 82 1 394 44.0 4 97.0 52 2130 24.6 82 2 395 32.0 4 135.0 84 2295 11.6 82 1 396 28.0 4 120.0 79 2625 18.6 82 1 397 31.0 4 119.0 82 2720 19.4 82 1
THERE ARE TOTAL OF 398 RECORDS IN THE DATA FRAME CAR_DATA
# MERGE THE DATA FRAMES CAR_NAME AND CAR_DATA INTO FINAL DATA FRAME CARS
# SINCE THERE ARE NO COMMON COLUMNS IN THE BOTH DATA FRAMES, WE CANNOT MERGE OR JOIN THE BOTH DATA FRAMES AS THERE IS NO
# POSSIBILITY TO SPECIFY A CONDITION ON DATA SETS. SO WE ARE PROCEEDING WITH THE APPENDING OF DATA FRAMES AS BOTH THE
# DATA SETS CONTAINS EQUAL ROWS AND WE CAN ASSIGN THE CAR NAMES IN CAR_NAME TO THE PERTAINING ROW DATA IN THE CAR_DATA
CARS = CAR_NAME.join(CAR_DATA)
#PRINT THE FIRST FIVE RECORDS OF THE NEW DATA FRAME CARS
print(CARS.head())
car_name mpg cyl disp hp wt acc yr origin 0 chevrolet chevelle malibu 18.0 8 307.0 130 3504 12.0 70 1 1 buick skylark 320 15.0 8 350.0 165 3693 11.5 70 1 2 plymouth satellite 18.0 8 318.0 150 3436 11.0 70 1 3 amc rebel sst 16.0 8 304.0 150 3433 12.0 70 1 4 ford torino 17.0 8 302.0 140 3449 10.5 70 1
#PRINT THE LAST FIVE RECORDS OF THE NEW DATA FRAME CARS
print(CARS.tail())
car_name mpg cyl disp hp wt acc yr origin 393 ford mustang gl 27.0 4 140.0 86 2790 15.6 82 1 394 vw pickup 44.0 4 97.0 52 2130 24.6 82 2 395 dodge rampage 32.0 4 135.0 84 2295 11.6 82 1 396 ford ranger 28.0 4 120.0 79 2625 18.6 82 1 397 chevy s-10 31.0 4 119.0 82 2720 19.4 82 1
THE MERGED DATA FRAME CARS CONTAINS 398 ROWS OF CAR MODEL NAMES AND RELATED DATA.
# PRINT THE DATA TYPES OF ALL THE COLUMNS IN THE CARS DATA FRAME
print(CARS.dtypes)
car_name object mpg float64 cyl int64 disp float64 hp object wt int64 acc float64 yr int64 origin int64 dtype: object
# PRINT INFORMATION OF CARS DATA
print(CARS.info())
print('\n\nTOTAL NO OF ROWS AND COLUMNS IN THE CARS DATA FRAME:',CARS.shape)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 car_name 398 non-null object 1 mpg 398 non-null float64 2 cyl 398 non-null int64 3 disp 398 non-null float64 4 hp 398 non-null object 5 wt 398 non-null int64 6 acc 398 non-null float64 7 yr 398 non-null int64 8 origin 398 non-null int64 dtypes: float64(3), int64(4), object(2) memory usage: 28.1+ KB None TOTAL NO OF ROWS AND COLUMNS IN THE CARS DATA FRAME: (398, 9)
FROM THE ABOVE INFORMATION,
* THERE ARE 398 FIELDS IN ALL THE COLUMNS.
* THERE ARE NO NULL FIELDS IN THE DATA.
* HORSE POWER COLUMN CONTAINS INTEGER DATA BUT REPRESENTED AS OBJECT DATA TYPE.
# PRINT ALL THE UNIQUE VALUES IN THE HORSE POWER COLUMN AND SEE WHETHER THERE ARE ANY STRINGS OR SPECIAL CHARACTERS
print(CARS.hp.unique())
print('\n\nROWS IN CARS DATA FRAME HAS ? IN HORSE POWER COLUMN:\n',CARS[CARS['hp'] == '?'])
[130 165 150 140 198 220 215 225 190 170 160 95 97 85 88 46 87 90 113 200
210 193 '?' 100 105 175 153 180 110 72 86 70 76 65 69 60 80 54 208 155
112 92 145 137 158 167 94 107 230 49 75 91 122 67 83 78 52 61 93 148 129
96 71 98 115 53 81 79 120 152 102 108 68 58 149 89 63 48 66 139 103 125
133 138 135 142 77 62 132 84 64 74 116 82]
ROWS IN CARS DATA FRAME HAS ? IN HORSE POWER COLUMN:
car_name mpg cyl disp hp wt acc yr origin
32 ford pinto 25.0 4 98.0 ? 2046 19.0 71 1
126 ford maverick 21.0 6 200.0 ? 2875 17.0 74 1
330 renault lecar deluxe 40.9 4 85.0 ? 1835 17.3 80 2
336 ford mustang cobra 23.6 4 140.0 ? 2905 14.3 80 1
354 renault 18i 34.5 4 100.0 ? 2320 15.8 81 2
374 amc concord dl 23.0 4 151.0 ? 3035 20.5 82 1
* THERE ARE 6 ROWS THAT CONTAINS THE ? IN THE HORSEPOWER COLUMN
* LET US DROP ALL THOSE ROWS THAT CONTAINS ? IN FURTHER ANALYSIS
# PRINT DESCRIBE() ON CARS DATA FRAME
print('\n\n',CARS.describe())
mpg cyl disp wt acc \
count 398.000000 398.000000 398.000000 398.000000 398.000000
mean 23.514573 5.454774 193.425879 2970.424623 15.568090
std 7.815984 1.701004 104.269838 846.841774 2.757689
min 9.000000 3.000000 68.000000 1613.000000 8.000000
25% 17.500000 4.000000 104.250000 2223.750000 13.825000
50% 23.000000 4.000000 148.500000 2803.500000 15.500000
75% 29.000000 8.000000 262.000000 3608.000000 17.175000
max 46.600000 8.000000 455.000000 5140.000000 24.800000
yr origin
count 398.000000 398.000000
mean 76.010050 1.572864
std 3.697627 0.802055
min 70.000000 1.000000
25% 73.000000 1.000000
50% 76.000000 1.000000
75% 79.000000 2.000000
max 82.000000 3.000000
* THERE ARE NO NEGATIVE VALUES IN THE COLUMNS
* DISP AND WT COLUMNS ARE MORE RIGHT SKEWED
* PRESENSE OF OUT LIERS IN THE DISP AND WT COLUMNS
# CHECK FOR MISSING VALUES IN THE EACH COLUMN IN THE CARS DATA FRAME
print(CARS.isna().sum())
car_name 0 mpg 0 cyl 0 disp 0 hp 0 wt 0 acc 0 yr 0 origin 0 dtype: int64
* THERE ARE NO NULL / MISSING VALUES IN THE CARS DATA FRAME
# PRINT THE COUNT OF DUPLICATE VALUES IN THE CARS DATA FRAME. IF PRESENT, WE WILL TRY TO IMPUTE THOSE DATA
print('TOTAL NUMBER OF DUPLICATE VALUES PRESENT IN CARS DATA FRAME:',CARS.duplicated().sum())
TOTAL NUMBER OF DUPLICATE VALUES PRESENT IN CARS DATA FRAME: 0
* THERE ARE NO DUPLICATE VALUES PRESENT IN THE CARS DATA FRAME
sns.pairplot(CARS, hue = "origin", diag_kind = "kde",kind = "scatter", markers=["o", "s", "D"],palette = "Set2")
plt.show()
*WE USED HUE ON ORIGIN AND PLOTTED PAIR PLOT FOR ALL THE COLUMNS IN THE CARS DATA
ax = sns.scatterplot(data=CARS, x="wt", y="disp", hue="cyl", palette="Set2")
ax.set_xlabel("WEIGHT")
ax.set_ylabel("DISPLACEMENT")
plt.show()
*WE HAVE DRAWN SCATTER PLOT AGAINST WEIGHT AND DISPLACEMENT FROM THE CARS DATA.
*WE USED CYCLINDER COLUMN DATA TO DISTINGUISH THE DATA POINTS.
*CYLINDER VALUE 8,4,6 HAVE MORE DATA POINTS AND 3,5 HAS LESS DATA POINTS
ax = sns.scatterplot(data=CARS, x="wt", y="mpg", hue="cyl", palette="tab10")
ax.set_xlabel("WEIGHT")
ax.set_ylabel("MPG")
plt.show()
* WE HAVE DRAWN A SCATTER PLOT FOR WEIGHT AGAINST MPG WITH CYLINDER AS HUE
* CYLINDER WITH 4,6,8 HAVE MAXIMUM DATA POINTS AGAINST CYLINDER VALUE 3,5
for i in CARS.columns:
print('UNIQUE VALUES IN COLUMN ',i)
print(CARS[i].unique())
print('\n')
UNIQUE VALUES IN COLUMN car_name ['chevrolet chevelle malibu' 'buick skylark 320' 'plymouth satellite' 'amc rebel sst' 'ford torino' 'ford galaxie 500' 'chevrolet impala' 'plymouth fury iii' 'pontiac catalina' 'amc ambassador dpl' 'dodge challenger se' "plymouth 'cuda 340" 'chevrolet monte carlo' 'buick estate wagon (sw)' 'toyota corona mark ii' 'plymouth duster' 'amc hornet' 'ford maverick' 'datsun pl510' 'volkswagen 1131 deluxe sedan' 'peugeot 504' 'audi 100 ls' 'saab 99e' 'bmw 2002' 'amc gremlin' 'ford f250' 'chevy c20' 'dodge d200' 'hi 1200d' 'chevrolet vega 2300' 'toyota corona' 'ford pinto' 'plymouth satellite custom' 'ford torino 500' 'amc matador' 'pontiac catalina brougham' 'dodge monaco (sw)' 'ford country squire (sw)' 'pontiac safari (sw)' 'amc hornet sportabout (sw)' 'chevrolet vega (sw)' 'pontiac firebird' 'ford mustang' 'mercury capri 2000' 'opel 1900' 'peugeot 304' 'fiat 124b' 'toyota corolla 1200' 'datsun 1200' 'volkswagen model 111' 'plymouth cricket' 'toyota corona hardtop' 'dodge colt hardtop' 'volkswagen type 3' 'chevrolet vega' 'ford pinto runabout' 'amc ambassador sst' 'mercury marquis' 'buick lesabre custom' 'oldsmobile delta 88 royale' 'chrysler newport royal' 'mazda rx2 coupe' 'amc matador (sw)' 'chevrolet chevelle concours (sw)' 'ford gran torino (sw)' 'plymouth satellite custom (sw)' 'volvo 145e (sw)' 'volkswagen 411 (sw)' 'peugeot 504 (sw)' 'renault 12 (sw)' 'ford pinto (sw)' 'datsun 510 (sw)' 'toyouta corona mark ii (sw)' 'dodge colt (sw)' 'toyota corolla 1600 (sw)' 'buick century 350' 'chevrolet malibu' 'ford gran torino' 'dodge coronet custom' 'mercury marquis brougham' 'chevrolet caprice classic' 'ford ltd' 'plymouth fury gran sedan' 'chrysler new yorker brougham' 'buick electra 225 custom' 'amc ambassador brougham' 'plymouth valiant' 'chevrolet nova custom' 'volkswagen super beetle' 'ford country' 'plymouth custom suburb' 'oldsmobile vista cruiser' 'toyota carina' 'datsun 610' 'maxda rx3' 'mercury capri v6' 'fiat 124 sport coupe' 'chevrolet monte carlo s' 'pontiac grand prix' 'fiat 128' 'opel manta' 'audi 100ls' 'volvo 144ea' 'dodge dart custom' 'saab 99le' 'toyota mark ii' 'oldsmobile omega' 'chevrolet nova' 'datsun b210' 'chevrolet chevelle malibu classic' 'plymouth satellite sebring' 'buick century luxus (sw)' 'dodge coronet custom (sw)' 'audi fox' 'volkswagen dasher' 'datsun 710' 'dodge colt' 'fiat 124 tc' 'honda civic' 'subaru' 'fiat x1.9' 'plymouth valiant custom' 'mercury monarch' 'chevrolet bel air' 'plymouth grand fury' 'buick century' 'chevroelt chevelle malibu' 'plymouth fury' 'buick skyhawk' 'chevrolet monza 2+2' 'ford mustang ii' 'toyota corolla' 'pontiac astro' 'volkswagen rabbit' 'amc pacer' 'volvo 244dl' 'honda civic cvcc' 'fiat 131' 'capri ii' 'renault 12tl' 'dodge coronet brougham' 'chevrolet chevette' 'chevrolet woody' 'vw rabbit' 'dodge aspen se' 'ford granada ghia' 'pontiac ventura sj' 'amc pacer d/l' 'datsun b-210' 'volvo 245' 'plymouth volare premier v8' 'mercedes-benz 280s' 'cadillac seville' 'chevy c10' 'ford f108' 'dodge d100' 'honda accord cvcc' 'buick opel isuzu deluxe' 'renault 5 gtl' 'plymouth arrow gs' 'datsun f-10 hatchback' 'oldsmobile cutlass supreme' 'dodge monaco brougham' 'mercury cougar brougham' 'chevrolet concours' 'buick skylark' 'plymouth volare custom' 'ford granada' 'pontiac grand prix lj' 'chevrolet monte carlo landau' 'chrysler cordoba' 'ford thunderbird' 'volkswagen rabbit custom' 'pontiac sunbird coupe' 'toyota corolla liftback' 'ford mustang ii 2+2' 'dodge colt m/m' 'subaru dl' 'datsun 810' 'bmw 320i' 'mazda rx-4' 'volkswagen rabbit custom diesel' 'ford fiesta' 'mazda glc deluxe' 'datsun b210 gx' 'oldsmobile cutlass salon brougham' 'dodge diplomat' 'mercury monarch ghia' 'pontiac phoenix lj' 'ford fairmont (auto)' 'ford fairmont (man)' 'plymouth volare' 'amc concord' 'buick century special' 'mercury zephyr' 'dodge aspen' 'amc concord d/l' 'buick regal sport coupe (turbo)' 'ford futura' 'dodge magnum xe' 'datsun 510' 'dodge omni' 'toyota celica gt liftback' 'plymouth sapporo' 'oldsmobile starfire sx' 'datsun 200-sx' 'audi 5000' 'volvo 264gl' 'saab 99gle' 'peugeot 604sl' 'volkswagen scirocco' 'honda accord lx' 'pontiac lemans v6' 'mercury zephyr 6' 'ford fairmont 4' 'amc concord dl 6' 'dodge aspen 6' 'ford ltd landau' 'mercury grand marquis' 'dodge st. regis' 'chevrolet malibu classic (sw)' 'chrysler lebaron town @ country (sw)' 'vw rabbit custom' 'maxda glc deluxe' 'dodge colt hatchback custom' 'amc spirit dl' 'mercedes benz 300d' 'cadillac eldorado' 'plymouth horizon' 'plymouth horizon tc3' 'datsun 210' 'fiat strada custom' 'buick skylark limited' 'chevrolet citation' 'oldsmobile omega brougham' 'pontiac phoenix' 'toyota corolla tercel' 'datsun 310' 'ford fairmont' 'audi 4000' 'toyota corona liftback' 'mazda 626' 'datsun 510 hatchback' 'mazda glc' 'vw rabbit c (diesel)' 'vw dasher (diesel)' 'audi 5000s (diesel)' 'mercedes-benz 240d' 'honda civic 1500 gl' 'renault lecar deluxe' 'vokswagen rabbit' 'datsun 280-zx' 'mazda rx-7 gs' 'triumph tr7 coupe' 'ford mustang cobra' 'honda accord' 'plymouth reliant' 'dodge aries wagon (sw)' 'toyota starlet' 'plymouth champ' 'honda civic 1300' 'datsun 210 mpg' 'toyota tercel' 'mazda glc 4' 'plymouth horizon 4' 'ford escort 4w' 'ford escort 2h' 'volkswagen jetta' 'renault 18i' 'honda prelude' 'datsun 200sx' 'peugeot 505s turbo diesel' 'volvo diesel' 'toyota cressida' 'datsun 810 maxima' 'oldsmobile cutlass ls' 'ford granada gl' 'chrysler lebaron salon' 'chevrolet cavalier' 'chevrolet cavalier wagon' 'chevrolet cavalier 2-door' 'pontiac j2000 se hatchback' 'dodge aries se' 'ford fairmont futura' 'amc concord dl' 'volkswagen rabbit l' 'mazda glc custom l' 'mazda glc custom' 'plymouth horizon miser' 'mercury lynx l' 'nissan stanza xe' 'honda civic (auto)' 'datsun 310 gx' 'buick century limited' 'oldsmobile cutlass ciera (diesel)' 'chrysler lebaron medallion' 'ford granada l' 'toyota celica gt' 'dodge charger 2.2' 'chevrolet camaro' 'ford mustang gl' 'vw pickup' 'dodge rampage' 'ford ranger' 'chevy s-10'] UNIQUE VALUES IN COLUMN mpg [18. 15. 16. 17. 14. 24. 22. 21. 27. 26. 25. 10. 11. 9. 28. 19. 12. 13. 23. 30. 31. 35. 20. 29. 32. 33. 17.5 15.5 14.5 22.5 24.5 18.5 29.5 26.5 16.5 31.5 36. 25.5 33.5 20.5 30.5 21.5 43.1 36.1 32.8 39.4 19.9 19.4 20.2 19.2 25.1 20.6 20.8 18.6 18.1 17.7 27.5 27.2 30.9 21.1 23.2 23.8 23.9 20.3 21.6 16.2 19.8 22.3 17.6 18.2 16.9 31.9 34.1 35.7 27.4 25.4 34.2 34.5 31.8 37.3 28.4 28.8 26.8 41.5 38.1 32.1 37.2 26.4 24.3 19.1 34.3 29.8 31.3 37. 32.2 46.6 27.9 40.8 44.3 43.4 36.4 44.6 40.9 33.8 32.7 23.7 23.6 32.4 26.6 25.8 23.5 39.1 39. 35.1 32.3 37.7 34.7 34.4 29.9 33.7 32.9 31.6 28.1 30.7 24.2 22.4 34. 38. 44. ] UNIQUE VALUES IN COLUMN cyl [8 4 6 3 5] UNIQUE VALUES IN COLUMN disp [307. 350. 318. 304. 302. 429. 454. 440. 455. 390. 383. 340. 400. 113. 198. 199. 200. 97. 110. 107. 104. 121. 360. 140. 98. 232. 225. 250. 351. 258. 122. 116. 79. 88. 71. 72. 91. 97.5 70. 120. 96. 108. 155. 68. 114. 156. 76. 83. 90. 231. 262. 134. 119. 171. 115. 101. 305. 85. 130. 168. 111. 260. 151. 146. 80. 78. 105. 131. 163. 89. 267. 86. 183. 141. 173. 135. 81. 100. 145. 112. 181. 144. ] UNIQUE VALUES IN COLUMN hp [130 165 150 140 198 220 215 225 190 170 160 95 97 85 88 46 87 90 113 200 210 193 '?' 100 105 175 153 180 110 72 86 70 76 65 69 60 80 54 208 155 112 92 145 137 158 167 94 107 230 49 75 91 122 67 83 78 52 61 93 148 129 96 71 98 115 53 81 79 120 152 102 108 68 58 149 89 63 48 66 139 103 125 133 138 135 142 77 62 132 84 64 74 116 82] UNIQUE VALUES IN COLUMN wt [3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 3563 3609 3761 3086 2372 2833 2774 2587 2130 1835 2672 2430 2375 2234 2648 4615 4376 4382 4732 2264 2228 2046 2634 3439 3329 3302 3288 4209 4464 4154 4096 4955 4746 5140 2962 2408 3282 3139 2220 2123 2074 2065 1773 1613 1834 1955 2278 2126 2254 2226 4274 4385 4135 4129 3672 4633 4502 4456 4422 2330 3892 4098 4294 4077 2933 2511 2979 2189 2395 2288 2506 2164 2100 4100 3988 4042 3777 4952 4363 4237 4735 4951 3821 3121 3278 2945 3021 2904 1950 4997 4906 4654 4499 2789 2279 2401 2379 2124 2310 2472 2265 4082 4278 1867 2158 2582 2868 3399 2660 2807 3664 3102 2875 2901 3336 2451 1836 2542 3781 3632 3613 4141 4699 4457 4638 4257 2219 1963 2300 1649 2003 2125 2108 2246 2489 2391 2000 3264 3459 3432 3158 4668 4440 4498 4657 3907 3897 3730 3785 3039 3221 3169 2171 2639 2914 2592 2702 2223 2545 2984 1937 3211 2694 2957 2671 1795 2464 2572 2255 2202 4215 4190 3962 3233 3353 3012 3085 2035 3651 3574 3645 3193 1825 1990 2155 2565 3150 3940 3270 2930 3820 4380 4055 3870 3755 2045 1945 3880 4060 4140 4295 3520 3425 3630 3525 4220 4165 4325 4335 1940 2740 2755 2051 2075 1985 2190 2815 2600 2720 1800 2070 3365 3735 3570 3535 3155 2965 3430 3210 3380 3070 3620 3410 3445 3205 4080 2560 2230 2515 2745 2855 2405 2830 3140 2795 2135 3245 2990 2890 3265 3360 3840 3725 3955 3830 4360 4054 3605 1925 1975 1915 2670 3530 3900 3190 3420 2200 2150 2020 2595 2700 2556 2144 1968 2120 2019 2678 2870 3003 3381 2188 2711 2434 2110 2800 2085 2335 2950 3250 1850 2145 1845 2910 2420 2500 2905 2290 2490 2635 2620 2725 2385 1755 1875 1760 2050 2215 2380 2320 2210 2350 2615 3230 3160 2900 3415 3060 3465 2605 2640 2575 2525 2735 2865 3035 1980 2025 1970 2160 2205 2245 1965 1995 3015 2585 2835 2665 2370 2790 2295 2625] UNIQUE VALUES IN COLUMN acc [12. 11.5 11. 10.5 10. 9. 8.5 8. 9.5 15. 15.5 16. 14.5 20.5 17.5 12.5 14. 13.5 18.5 19. 13. 19.5 18. 17. 23.5 16.5 21. 16.9 14.9 17.7 15.3 13.9 12.8 15.4 17.6 22.2 22.1 14.2 17.4 16.2 17.8 12.2 16.4 13.6 15.7 13.2 21.9 16.7 12.1 14.8 18.6 16.8 13.7 11.1 11.4 18.2 15.8 15.9 14.1 21.5 14.4 19.4 19.2 17.2 18.7 15.1 13.4 11.2 14.7 16.6 17.3 15.2 14.3 20.1 24.8 11.3 12.9 18.8 18.1 17.9 21.7 23.7 19.9 21.8 13.8 12.6 16.1 20.7 18.3 20.4 19.6 17.1 15.6 24.6 11.6] UNIQUE VALUES IN COLUMN yr [70 71 72 73 74 75 76 77 78 79 80 81 82] UNIQUE VALUES IN COLUMN origin [1 3 2]
* WE HAVE PRINTED UNIQUE VALUES IN ALL THE COLUMNS OF THE CAR DATA.
* COLUMN HP HAS "?" IN SOME OF THE ROWS.
print('ROWS THAT CONTAIN ? IN THE HORSE POWER COLUMN:\n',CARS[CARS['hp'] == '?'])
ROWS THAT CONTAIN ? IN THE HORSE POWER COLUMN:
car_name mpg cyl disp hp wt acc yr origin
32 ford pinto 25.0 4 98.0 ? 2046 19.0 71 1
126 ford maverick 21.0 6 200.0 ? 2875 17.0 74 1
330 renault lecar deluxe 40.9 4 85.0 ? 1835 17.3 80 2
336 ford mustang cobra 23.6 4 140.0 ? 2905 14.3 80 1
354 renault 18i 34.5 4 100.0 ? 2320 15.8 81 2
374 amc concord dl 23.0 4 151.0 ? 3035 20.5 82 1
*THERE ARE 6 ROWS IN THE CARS DATA FRAME THAT CONTAINS VALUES "?"
*WE WILL DROP ALL THOSE 6 ROWS THAT CONTAINS THE SPECIAL CHARACTER "?"
#WE CAN EITHER PROCEED TO DROP THE ROWS THAT HAS "?" SYMBOL OR REPLACE THE "?" WITH MEDIAN VALUE
#CARS = CARS[CARS.hp != '?']
#print('SHAPE OF THE DATA FRAME CARS AFTER DROPPING THE 6 ROWS THAT CONTAINS SPECIAL CHARACTER:',CARS.shape)
CARS['hp'].replace("?",np.nan, inplace=True)
CARS['hp'].fillna((CARS['hp'].median()), inplace=True)
# CHANGE THE COLUMN NAMES FOR MORE READABILITY.
CARS.rename(columns = {'car_name':'NAME','mpg':'MPG','cyl':'CYLINDER','disp':'DISPLACEMENT',
'hp':'HORSE POWER','wt':'WEIGHT','acc':'ACCELERATION','yr':'MODEL YEAR',
'origin':'ORIGIN'},inplace = True)
CARS['HORSE POWER'] = CARS['HORSE POWER'].astype('float')
print(CARS.describe())
MPG CYLINDER DISPLACEMENT HORSE POWER WEIGHT \
count 398.000000 398.000000 398.000000 398.000000 398.000000
mean 23.514573 5.454774 193.425879 104.304020 2970.424623
std 7.815984 1.701004 104.269838 38.222625 846.841774
min 9.000000 3.000000 68.000000 46.000000 1613.000000
25% 17.500000 4.000000 104.250000 76.000000 2223.750000
50% 23.000000 4.000000 148.500000 93.500000 2803.500000
75% 29.000000 8.000000 262.000000 125.000000 3608.000000
max 46.600000 8.000000 455.000000 230.000000 5140.000000
ACCELERATION MODEL YEAR ORIGIN
count 398.000000 398.000000 398.000000
mean 15.568090 76.010050 1.572864
std 2.757689 3.697627 0.802055
min 8.000000 70.000000 1.000000
25% 13.825000 73.000000 1.000000
50% 15.500000 76.000000 1.000000
75% 17.175000 79.000000 2.000000
max 24.800000 82.000000 3.000000
print('CORRELATION MATRIX:')
CARS.corr(method = 'pearson')
CORRELATION MATRIX:
| MPG | CYLINDER | DISPLACEMENT | HORSE POWER | WEIGHT | ACCELERATION | MODEL YEAR | ORIGIN | |
|---|---|---|---|---|---|---|---|---|
| MPG | 1.000000 | -0.775396 | -0.804203 | -0.773453 | -0.831741 | 0.420289 | 0.579267 | 0.563450 |
| CYLINDER | -0.775396 | 1.000000 | 0.950721 | 0.841284 | 0.896017 | -0.505419 | -0.348746 | -0.562543 |
| DISPLACEMENT | -0.804203 | 0.950721 | 1.000000 | 0.895778 | 0.932824 | -0.543684 | -0.370164 | -0.609409 |
| HORSE POWER | -0.773453 | 0.841284 | 0.895778 | 1.000000 | 0.862442 | -0.686590 | -0.413733 | -0.452096 |
| WEIGHT | -0.831741 | 0.896017 | 0.932824 | 0.862442 | 1.000000 | -0.417457 | -0.306564 | -0.581024 |
| ACCELERATION | 0.420289 | -0.505419 | -0.543684 | -0.686590 | -0.417457 | 1.000000 | 0.288137 | 0.205873 |
| MODEL YEAR | 0.579267 | -0.348746 | -0.370164 | -0.413733 | -0.306564 | 0.288137 | 1.000000 | 0.180662 |
| ORIGIN | 0.563450 | -0.562543 | -0.609409 | -0.452096 | -0.581024 | 0.205873 | 0.180662 | 1.000000 |
*BELOW PAIRS ARE HIGHLY CORRELATED:
.CYLINDER AND DISPLACEMENT
.WEIGHT AND DISPLACEMENT
*BELOW PAIRS ARE LOOSELY CORRELATED:
.MPG AND CYLINDER
.DISPLACEMENT AND MPG
.WEIGHT AND MPG
.HORSEPOWER AND MPG
sns.pairplot(CARS, hue = "ORIGIN", diag_kind = "kde",kind = "scatter", markers=["o", "s", "D"],palette = "Set2")
plt.show()
plt.figure(figsize=(8,8))
ax = sns.heatmap(CARS.corr(method = 'pearson'), annot = True, linewidths = 1, square = True, cmap="YlOrRd")
plt.title("CARS DATA CORRELATION HEAT MAP")
plt. show()
* MODEL YEAR HAS MIN VALUE 70 AND MAX VALUE 82.
* WE ASSUME THAT THE SEGMENTATION PROCESS IS BEING CARRIED OUT IN THE YEAR 83.
* WITH THIS ASSUMPTION, WE CAN CALCULATE THE AGE OF THE VEHICLE BY FORMULA 83 - MODEL YEAR
* ONCE THE AGE IS CALCULATED, WE CAN DROP THE YEAR COLUMN.
* AGE IN THIS SCENARIO REPRESENTS FROM HOW MANY YEARS THE CAR IS BEING USED
CARS['AGE'] = 83 - CARS['MODEL YEAR']
* ORIGIN HAS THREE VALUES 1,2,3. CARS BELONGING TO ORIGIN 1 ARE MORE THAN 2 AND 3 COMBINED.
* WE CAN CREATE 3 VARIABLE FOR 1,2,3 AND NAME THEM AS ORIGIN_1, ORIGIN_2, ORIGIN_3
* THIS HELPS THE MODEL IN UNDERSTANDING THE TRAINING DATA AND CAN BE RESCALED EASILY.
# ONE-HOT ENCODING THE COLUMN ORIGIN AND CREATE THRE VARIABLES ORIGIN_1,ORIGIN_2,ORIGIN_3
OHEC = pd.get_dummies(CARS['ORIGIN'])
OHEC = OHEC.add_prefix('ORIGIN_')
# JOIN THE THREE NEW VARIABLES ORIGIN_1,ORIGIN_2,ORIGIN_3 TO THE OUR DATA FRAME CARS
CARS = CARS.join(OHEC)
CARS_COPY = CARS.copy()
CARS.head()
| NAME | MPG | CYLINDER | DISPLACEMENT | HORSE POWER | WEIGHT | ACCELERATION | MODEL YEAR | ORIGIN | AGE | ORIGIN_1 | ORIGIN_2 | ORIGIN_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 | 13 | 1 | 0 | 0 |
| 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 | 13 | 1 | 0 | 0 |
| 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 | 13 | 1 | 0 | 0 |
| 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 | 13 | 1 | 0 | 0 |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 | 13 | 1 | 0 | 0 |
# DROP THE COLUMNS MODEL YEAR AND ORIGIN FROM THE DATA FRAME
# WE ARE DROPPING MODEL YEAR SINCE WE HAVE CALCULATED THE AGE OF THE CAR AND HAVING MODEL YEAR ALONG WITH AGE IS NOT MUCH USEFUL
# SINCE WE CREATED DUMMIES ON ORIGIN COLUMN, WE CAN DROP THE ORIGIN COLUMN
# SINCE WE HAVE ALL THE CAR NAMES UNIQUE, WE CAN DROP THE CAR NAMES AS IT WILL BE DIFFICULT TO CATEGORISE
CARS = CARS.drop(['MODEL YEAR','ORIGIN','NAME'],axis = 1)
CARS.head()
| MPG | CYLINDER | DISPLACEMENT | HORSE POWER | WEIGHT | ACCELERATION | AGE | ORIGIN_1 | ORIGIN_2 | ORIGIN_3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 13 | 1 | 0 | 0 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 13 | 1 | 0 | 0 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 13 | 1 | 0 | 0 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 13 | 1 | 0 | 0 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 13 | 1 | 0 | 0 |
COLS = CARS.drop(['ORIGIN_1','ORIGIN_2','ORIGIN_3'],axis = 1).columns
fig, ax = plt.subplots(3, 3, sharex=True, figsize=(16,16))
x=0
y=0
for i in COLS:
sns.boxplot(y = CARS[i], ax = ax[x, y], palette="Set2")
y = y+1
if y == 3:
x=x+1
y=0
fig.tight_layout()
plt.show()
*THERE ARE PRESENCE OF OUTLIERS IN HORSE POWER, ACCELERATION AND MPG COLUMNS
*DEALING WITH OUTLIERS CAN HELP US IN BETTER TRAINING THE MODEL.
*WE CAN EITHER REMOVE THE OUTLIERS OR TRANSFORM THEM TO FIT THE DATA
*SINCE THE AMOUNT OF OUTLIERS ARE VERY LOW, LET US FIT THEM INTO OUR DATA.
*WE CAN EITHER USE LOG TRANSFORM OR WINSORIZE OR ANY ROBUST SCALER METHOD.
*I WILL BE PROCEEDING WITH WINSORIZE TO DEAL WITH OUTLIERS.
from scipy.stats.mstats import winsorize
CARS['HORSE POWER'] = winsorize(CARS['HORSE POWER'],(0,0.03))
CARS['ACCELERATION'] = winsorize(CARS['ACCELERATION'],(0.02,0.02))
CARS['MPG'] = winsorize(CARS['MPG'],(0,0.01))
fig, ax = plt.subplots(figsize=(7,7))
sns.boxplot(y = CARS['HORSE POWER'])
plt.show()
fig, ax = plt.subplots(figsize=(7,7))
sns.boxplot(y = CARS['ACCELERATION'])
plt.show()
fig, ax = plt.subplots(figsize=(7,7))
sns.boxplot(y = CARS['MPG'])
plt.show()
*ON APPLYING WINSORIZE TO DEAL WITH OUTLIERS
*AFTER TRANSFORMING OUTLIERS, WE CAN SEE THAT THERE ARE NO OUTLIERS IN THE NEW BOX PLOTS ABOVE.
print('CORRELATION MATRIX:')
CARS.corr(method = 'pearson')
CORRELATION MATRIX:
| MPG | CYLINDER | DISPLACEMENT | HORSE POWER | WEIGHT | ACCELERATION | AGE | ORIGIN_1 | ORIGIN_2 | ORIGIN_3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| MPG | 1.000000 | -0.776848 | -0.805559 | -0.787166 | -0.833124 | 0.417775 | -0.579854 | -0.568504 | 0.260098 | 0.441526 |
| CYLINDER | -0.776848 | 1.000000 | 0.950721 | 0.853810 | 0.896017 | -0.508944 | 0.348746 | 0.604351 | -0.352861 | -0.396479 |
| DISPLACEMENT | -0.805559 | 0.950721 | 1.000000 | 0.899950 | 0.932824 | -0.545891 | 0.370164 | 0.651407 | -0.373886 | -0.433505 |
| HORSE POWER | -0.787166 | 0.853810 | 0.899950 | 1.000000 | 0.874271 | -0.690715 | 0.411683 | 0.493953 | -0.287690 | -0.324734 |
| WEIGHT | -0.833124 | 0.896017 | 0.932824 | 0.874271 | 1.000000 | -0.421968 | 0.306564 | 0.598398 | -0.298843 | -0.440817 |
| ACCELERATION | 0.417775 | -0.508944 | -0.545891 | -0.690715 | -0.421968 | 1.000000 | -0.282737 | -0.245515 | 0.191658 | 0.114956 |
| AGE | -0.579854 | 0.348746 | 0.370164 | 0.411683 | 0.306564 | -0.282737 | 1.000000 | 0.139883 | 0.024489 | -0.193101 |
| ORIGIN_1 | -0.568504 | 0.604351 | 0.651407 | 0.493953 | 0.598398 | -0.245515 | 0.139883 | 1.000000 | -0.597198 | -0.643317 |
| ORIGIN_2 | 0.260098 | -0.352861 | -0.373886 | -0.287690 | -0.298843 | 0.191658 | 0.024489 | -0.597198 | 1.000000 | -0.229895 |
| ORIGIN_3 | 0.441526 | -0.396479 | -0.433505 | -0.324734 | -0.440817 | 0.114956 | -0.193101 | -0.643317 | -0.229895 | 1.000000 |
sns.pairplot(CARS, diag_kind = "kde",kind = "scatter", palette = "Set2")
plt.show()
plt.figure(figsize=(8,8))
ax = sns.heatmap(CARS.corr(method = 'pearson'), annot = True, linewidths = 1, square = True, cmap="YlOrRd")
plt.title("CARS DATA CORRELATION HEAT MAP")
plt. show()
* AFTER DROPPING THE COLUMNS THAT DOES NOT HAVE MUCH SIGNIFICANCE AND TRANSFORMING OUTLIERS,
WE HAVE DRAWN CORRELATION MATRIX, PAIR PLOT AND HEAT MAP FOR THE NEW DATA SET FORMED.
* SINCE WE HAVE DEALTH WITH OUTLIERS AND OTHER FEATURES, WE HAVE OBTAINED FINAL DATA FRAME.
* NOW WE WILL PROCEED FOR SCALING THE DATA.
from sklearn.preprocessing import StandardScaler
CARS[COLS] = StandardScaler().fit_transform(CARS[COLS])
print(CARS.head())
MPG CYLINDER DISPLACEMENT HORSE POWER WEIGHT ACCELERATION \
0 -0.707513 1.498191 1.090604 0.714760 0.630870 -1.346115
1 -1.093024 1.498191 1.503514 1.668733 0.854333 -1.535120
2 -0.707513 1.498191 1.196232 1.259888 0.550470 -1.724125
3 -0.964520 1.498191 1.061796 1.259888 0.546923 -1.346115
4 -0.836017 1.498191 1.042591 0.987324 0.565841 -1.913130
AGE ORIGIN_1 ORIGIN_2 ORIGIN_3
0 1.627426 1 0 0
1 1.627426 1 0 0
2 1.627426 1 0 0
3 1.627426 1 0 0
4 1.627426 1 0 0
RANGE = np.arange(2,11,1)
ERRORS = []
for i in RANGE:
CLUSTERS= KMeans(i, n_init = 10)
CLUSTERS.fit(CARS)
LABELS = CLUSTERS.labels_
CENTROIDS = CLUSTERS.cluster_centers_
ERRORS.append(CLUSTERS.inertia_)
CL_DF = pd.DataFrame({"RANGE": RANGE, "ERRORS": ERRORS})
CL_DF[0:11]
| RANGE | ERRORS | |
|---|---|---|
| 0 | 2 | 1462.643320 |
| 1 | 3 | 1095.599851 |
| 2 | 4 | 880.387217 |
| 3 | 5 | 794.288409 |
| 4 | 6 | 728.508003 |
| 5 | 7 | 667.138643 |
| 6 | 8 | 623.367316 |
| 7 | 9 | 579.310427 |
| 8 | 10 | 549.468352 |
from matplotlib import cm
plt.figure(figsize=(15,8))
plt.plot( CL_DF.RANGE, CL_DF.ERRORS, marker = "*" )
plt.show()
*FROM THE ABOVE PLOT, WE CAN CONSIDER OUR ELBOW POINT AT 4.
* FROM THE ABOVE VISUAL, WE CAN SEE THAT THERE IS SHARP CURVE AT 4
* AT 3 AND 7, WE CAN ALSO CONSIDER THEM AS ELBOW POINTS, BUT 4 SEEMS TO BE OPTIMUM POINT.
* HENCE WE CAN CONSIDER THE NO OF CLUSTERS AS 4.
* LET US KMEANS WITH 4 CLUSTERS.
KMEAN = KMeans(n_clusters = 4, n_init = 10, random_state = 16384, verbose = 0)
print(KMEAN.fit(CARS))
KMeans(n_clusters=4, random_state=16384)
LABELS = KMEAN.labels_
CENTROIDS = KMEAN.cluster_centers_
ERRORS.append(CLUSTERS.inertia_)
COUNT = np.bincount(LABELS[LABELS>=0])
CEN_DF = pd.DataFrame(CENTROIDS, columns = list(CARS))
CEN_DF
| MPG | CYLINDER | DISPLACEMENT | HORSE POWER | WEIGHT | ACCELERATION | AGE | ORIGIN_1 | ORIGIN_2 | ORIGIN_3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.161198 | 1.498191 | 1.500684 | 1.529582 | 1.400016 | -1.089864 | 0.672556 | 1.000000 | -2.498002e-16 | 1.110223e-16 |
| 1 | 1.173488 | -0.825340 | -0.778107 | -0.760670 | -0.760687 | 0.404669 | -1.101793 | 0.421053 | 1.842105e-01 | 3.947368e-01 |
| 2 | 0.214829 | -0.868333 | -0.823983 | -0.561421 | -0.769895 | 0.294372 | 0.635472 | 0.244898 | 4.591837e-01 | 2.959184e-01 |
| 3 | -0.489198 | 0.405025 | 0.295490 | -0.039282 | 0.320511 | 0.313806 | -0.006206 | 0.901099 | 4.395604e-02 | 5.494505e-02 |
* WE HAVE CREATED 4 CLUSTERS ON CARS DATA CONSIDERING THE ELBOW POINT AT 4.
* GROUP 0,1,2,3 ARE 4 CLUSTERS THAT ARE FORMED.
* CLUSTER 1 HAS MAX MPG VALUE. CARS WHICH ARE NEW TENDS TO HAVE MORE MILEAGE COMPARED TO OLD CARS.
* CLUSTER 0 HAS LOWEST MPG VALUE MAKING THE CLUSTER IS CONTAINING OLD CARS
* CLUSTER 0 HAS HIGHEST CYLINDERS WHERE AS CLUSTER 2 HAS LOWEST
* CLUSTER 0 HAS HIGHEST DISPLACEMENT WHERE AS CLUSTER 2 HAS LOWEST DISPLACEMENT
* CLUSTER 0 HAS HIGHEST HORSEPOWER WHERE AS CLUSTER 1 HAS LOWEST
* CLUSTER 0 HAS HIGHEST WEIGHT WHICH IS FINE CONSIDERING THEY CONTAIN MORE CYLINDERS WHICH IS PROPORTIONAL TO WEIGHT
* CLUSTER 0 HAS LOW ACCELERATION AS THEY ARE MORE WEIGHT WITH MORE CYLINDERS THAT REDUCES ACCELERATION
* CLUSTER 1 HAS HIGHEST ACCELERATION CONSIDERING THEY HAVE LESS CYLINDERS AND LESS WEIGHT
* CLUSTER 0 HAS MOST OF THEIR CARS FROM ORIGIN 1
* CLUSTER 1 HAS MOST OF THE CARS FROM ORIGIN 2,3 COMBINED AND THEN FROM 1
* IN THIS STEP,WE WILL ADD THE CLUSTERS TO THE CARS DATA FRAME.
# CREATE THE GROUPS BASED ON 4 CLUSTERS
GROUPS = KMEAN.predict(CARS)
# ADD THE GROUPS TO THE CARS_COPY DATA FRAME CREATED EARLIER
CARS_COPY['GROUP'] = GROUPS
CARS['GROUP'] = GROUPS
CARS_COPY['GROUP'] = CARS_COPY['GROUP'].astype('category')
CARS['GROUP'] = CARS['GROUP'].astype('category')
CARS_COPY.head()
| NAME | MPG | CYLINDER | DISPLACEMENT | HORSE POWER | WEIGHT | ACCELERATION | MODEL YEAR | ORIGIN | AGE | ORIGIN_1 | ORIGIN_2 | ORIGIN_3 | GROUP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 | 13 | 1 | 0 | 0 | 0 |
| 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 | 13 | 1 | 0 | 0 | 0 |
| 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 | 13 | 1 | 0 | 0 | 0 |
| 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 | 13 | 1 | 0 | 0 | 0 |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 | 13 | 1 | 0 | 0 | 0 |
CARS_COPY.tail()
| NAME | MPG | CYLINDER | DISPLACEMENT | HORSE POWER | WEIGHT | ACCELERATION | MODEL YEAR | ORIGIN | AGE | ORIGIN_1 | ORIGIN_2 | ORIGIN_3 | GROUP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 393 | ford mustang gl | 27.0 | 4 | 140.0 | 86.0 | 2790 | 15.6 | 82 | 1 | 1 | 1 | 0 | 0 | 1 |
| 394 | vw pickup | 44.0 | 4 | 97.0 | 52.0 | 2130 | 24.6 | 82 | 2 | 1 | 0 | 1 | 0 | 1 |
| 395 | dodge rampage | 32.0 | 4 | 135.0 | 84.0 | 2295 | 11.6 | 82 | 1 | 1 | 1 | 0 | 0 | 1 |
| 396 | ford ranger | 28.0 | 4 | 120.0 | 79.0 | 2625 | 18.6 | 82 | 1 | 1 | 1 | 0 | 0 | 1 |
| 397 | chevy s-10 | 31.0 | 4 | 119.0 | 82.0 | 2720 | 19.4 | 82 | 1 | 1 | 1 | 0 | 0 | 1 |
sns.pairplot(CARS, hue = "GROUP", diag_kind = "kde",kind = "scatter", markers=["o", "*", "s", "D"],palette = "Set2")
plt.show()
CARS.boxplot(by = 'GROUP', layout = (3,4), figsize = (20,15))
plt.show()
COLS = CARS.drop(['ORIGIN_1','ORIGIN_2','ORIGIN_3','GROUP'],axis = 1).columns
fig, ax = plt.subplots(3, 3, sharex=True, figsize=(16,16))
x=0
y=0
for i in COLS:
sns.scatterplot(y = CARS[i], x = CARS['GROUP'], ax = ax[x, y], palette="Set2", markers=["o", "*", "s", "D"], hue = CARS['GROUP'])
y = y+1
if y == 3:
x=x+1
y=0
fig.tight_layout()
plt.show()
*WE ARE CREATING THE STRUCT THAT CONTAINS THE DIFFERENT DATA POINTS WE WANT TO TEST THE MODEL.
*WE WILL CREATE A DATAFRAME OF 6 NEW DATA POINTS AND TEST THEM AGAINST THE MODEL.
*INSTEAD OF A SINGLE DATA POINT, I AM TAKING 6 DATA POINTS AND PREDICT THEM AGAINST THE MODEL
*WE WILL CHECK WHETHER THE TRAINED MODEL WILL CLUSTER THE DATA INTO THE 4 GROUPS THAT WERE CREATED EARLIER.
DATA_POINTS = {'NAME':['honda amaze','ford titanium','hyundai verna','maruti alto','fiat punto','tata tiago'],
'MPG':[19,15,16,18,17,19],'CYLINDER':[8,6,7,5,6,6],
'DISPLACEMENT':[352,305,333,256,323,278],
'HORSE POWER':[195,155,179,106,159,173],
'WEIGHT':[3528,2907,3256,2347,3147,2853],
'ACCELERATION':[12.3,10.2,11.6,8.9,11.8,13.5],
'MODEL YEAR':[82,79,81,76,80,82],'ORIGIN':[3,1,2,3,2,1]}
TESTING_DATA = pd.DataFrame(DATA_POINTS)
TEST_DATA_POINTS = TESTING_DATA.copy()
TESTING_DATA
| NAME | MPG | CYLINDER | DISPLACEMENT | HORSE POWER | WEIGHT | ACCELERATION | MODEL YEAR | ORIGIN | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | honda amaze | 19 | 8 | 352 | 195 | 3528 | 12.3 | 82 | 3 |
| 1 | ford titanium | 15 | 6 | 305 | 155 | 2907 | 10.2 | 79 | 1 |
| 2 | hyundai verna | 16 | 7 | 333 | 179 | 3256 | 11.6 | 81 | 2 |
| 3 | maruti alto | 18 | 5 | 256 | 106 | 2347 | 8.9 | 76 | 3 |
| 4 | fiat punto | 17 | 6 | 323 | 159 | 3147 | 11.8 | 80 | 2 |
| 5 | tata tiago | 19 | 6 | 278 | 173 | 2853 | 13.5 | 82 | 1 |
# ONE-HOT ENCODING THE COLUMN ORIGIN AND CREATE THRE VARIABLES ORIGIN_1,ORIGIN_2,ORIGIN_3
OHEC_TEST = pd.get_dummies(TESTING_DATA['ORIGIN'])
OHEC_TEST = OHEC_TEST.add_prefix('ORIGIN_')
# JOIN THE THREE NEW VARIABLES ORIGIN_1,ORIGIN_2,ORIGIN_3 TO THE OUR DATA FRAME C
TESTING_DATA = TESTING_DATA.join(OHEC_TEST)
# CALCULATE THE AGE OF THE NEW CARS IN THE TESTING DATA
TESTING_DATA['AGE'] = 83 - TESTING_DATA['MODEL YEAR']
# DROP THE COLUMNS THAT ARE NOT ESSENTIAL FOR TESTING THE MODEL.
TESTING_DATA = TESTING_DATA.drop(['MODEL YEAR','ORIGIN','NAME'],axis = 1)
TESTING_DATA[COLS] = StandardScaler().fit_transform(TESTING_DATA[COLS])
print(KMEAN.fit(TESTING_DATA))
KMeans(n_clusters=4, random_state=16384)
ERRORS_TEST = []
LABELS_TEST = KMEAN.labels_
CENTROIDS_TESTING = KMEAN.cluster_centers_
COUNT_TEST = np.bincount(LABELS_TEST[LABELS_TEST>=0])
CEN_DF_TEST = pd.DataFrame(CENTROIDS_TESTING, columns = list(TESTING_DATA))
CEN_DF_TEST
| MPG | CYLINDER | DISPLACEMENT | HORSE POWER | WEIGHT | ACCELERATION | ORIGIN_1 | ORIGIN_2 | ORIGIN_3 | AGE | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.894427 | 2.960595e-16 | 0.382768 | 0.113278 | 0.262010 | -0.124035 | 0.333333 | 0.666667 | 0.0 | 0.000000 |
| 1 | 0.447214 | -1.414214e+00 | -1.587210 | -1.973422 | -1.780947 | -1.680107 | 0.000000 | 0.000000 | 1.0 | 1.921538 |
| 2 | 1.118034 | -3.535534e-01 | -0.913539 | 0.423302 | -0.414174 | 1.432037 | 1.000000 | 0.000000 | 0.0 | -0.960769 |
| 3 | 1.118034 | 1.767767e+00 | 1.352446 | 1.210286 | 1.409091 | 0.620174 | 0.000000 | 0.000000 | 1.0 | -0.960769 |
# CREATE THE GROUPS BASED ON 4 CLUSTERS
GROUPS_TEST = KMEAN.predict(TESTING_DATA)
# ADD THE GROUPS TO THE TESTING_DATA DATA FRAME CREATED
TEST_DATA_POINTS['GROUP'] = GROUPS_TEST
TEST_DATA_POINTS['GROUP'] = TEST_DATA_POINTS['GROUP'].astype('category')
TEST_DATA_POINTS
| NAME | MPG | CYLINDER | DISPLACEMENT | HORSE POWER | WEIGHT | ACCELERATION | MODEL YEAR | ORIGIN | GROUP | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | honda amaze | 19 | 8 | 352 | 195 | 3528 | 12.3 | 82 | 3 | 3 |
| 1 | ford titanium | 15 | 6 | 305 | 155 | 2907 | 10.2 | 79 | 1 | 0 |
| 2 | hyundai verna | 16 | 7 | 333 | 179 | 3256 | 11.6 | 81 | 2 | 0 |
| 3 | maruti alto | 18 | 5 | 256 | 106 | 2347 | 8.9 | 76 | 3 | 1 |
| 4 | fiat punto | 17 | 6 | 323 | 159 | 3147 | 11.8 | 80 | 2 | 0 |
| 5 | tata tiago | 19 | 6 | 278 | 173 | 2853 | 13.5 | 82 | 1 | 2 |
* WE HAVE CREATED 6 DATA POINTS TO TEST IN WHICH CLUSTERS THE NEW TEST DATA POINTS FITS INTO
* 3 OF THE CARS FITTED INTO CLUSTER 0, AND REMAINING EACH ONE OF THE CARS FITTED INT0 CLUSTER 1,2,3
* WE CAN SEE THAT OUR MODEL IS ABLE TO PREDICT AND GROUP THE NEW DATA POINTS PASSED FOR GROUPING
VEHICLE = pd.read_csv("vehicle.csv")
print("FIRST FIVE ROWS IN THE VEHICLE DATA FRAME:\n")
VEHICLE.head()
FIRST FIVE ROWS IN THE VEHICLE DATA FRAME:
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 95 | 48.0 | 83.0 | 178.0 | 72.0 | 10 | 162.0 | 42.0 | 20.0 | 159 | 176.0 | 379.0 | 184.0 | 70.0 | 6.0 | 16.0 | 187.0 | 197 | van |
| 1 | 91 | 41.0 | 84.0 | 141.0 | 57.0 | 9 | 149.0 | 45.0 | 19.0 | 143 | 170.0 | 330.0 | 158.0 | 72.0 | 9.0 | 14.0 | 189.0 | 199 | van |
| 2 | 104 | 50.0 | 106.0 | 209.0 | 66.0 | 10 | 207.0 | 32.0 | 23.0 | 158 | 223.0 | 635.0 | 220.0 | 73.0 | 14.0 | 9.0 | 188.0 | 196 | car |
| 3 | 93 | 41.0 | 82.0 | 159.0 | 63.0 | 9 | 144.0 | 46.0 | 19.0 | 143 | 160.0 | 309.0 | 127.0 | 63.0 | 6.0 | 10.0 | 199.0 | 207 | van |
| 4 | 85 | 44.0 | 70.0 | 205.0 | 103.0 | 52 | 149.0 | 45.0 | 19.0 | 144 | 241.0 | 325.0 | 188.0 | 127.0 | 9.0 | 11.0 | 180.0 | 183 | bus |
print("LAST FIVE ROWS IN THE VEHICLE DATA FRAME:\n")
VEHICLE.tail()
LAST FIVE ROWS IN THE VEHICLE DATA FRAME:
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 841 | 93 | 39.0 | 87.0 | 183.0 | 64.0 | 8 | 169.0 | 40.0 | 20.0 | 134 | 200.0 | 422.0 | 149.0 | 72.0 | 7.0 | 25.0 | 188.0 | 195 | car |
| 842 | 89 | 46.0 | 84.0 | 163.0 | 66.0 | 11 | 159.0 | 43.0 | 20.0 | 159 | 173.0 | 368.0 | 176.0 | 72.0 | 1.0 | 20.0 | 186.0 | 197 | van |
| 843 | 106 | 54.0 | 101.0 | 222.0 | 67.0 | 12 | 222.0 | 30.0 | 25.0 | 173 | 228.0 | 721.0 | 200.0 | 70.0 | 3.0 | 4.0 | 187.0 | 201 | car |
| 844 | 86 | 36.0 | 78.0 | 146.0 | 58.0 | 7 | 135.0 | 50.0 | 18.0 | 124 | 155.0 | 270.0 | 148.0 | 66.0 | 0.0 | 25.0 | 190.0 | 195 | car |
| 845 | 85 | 36.0 | 66.0 | 123.0 | 55.0 | 5 | 120.0 | 56.0 | 17.0 | 128 | 140.0 | 212.0 | 131.0 | 73.0 | 1.0 | 18.0 | 186.0 | 190 | van |
print("DATA TYPES OF THE VEHICLE DF:\n",VEHICLE.dtypes)
DATA TYPES OF THE VEHICLE DF: compactness int64 circularity float64 distance_circularity float64 radius_ratio float64 pr.axis_aspect_ratio float64 max.length_aspect_ratio int64 scatter_ratio float64 elongatedness float64 pr.axis_rectangularity float64 max.length_rectangularity int64 scaled_variance float64 scaled_variance.1 float64 scaled_radius_of_gyration float64 scaled_radius_of_gyration.1 float64 skewness_about float64 skewness_about.1 float64 skewness_about.2 float64 hollows_ratio int64 class object dtype: object
*EXCEPT THE CLASS COLUMN, ALL THE REMAINING COLUMNS ARE NUMERICAL DATA.
VEHICLE.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 846 entries, 0 to 845 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 compactness 846 non-null int64 1 circularity 841 non-null float64 2 distance_circularity 842 non-null float64 3 radius_ratio 840 non-null float64 4 pr.axis_aspect_ratio 844 non-null float64 5 max.length_aspect_ratio 846 non-null int64 6 scatter_ratio 845 non-null float64 7 elongatedness 845 non-null float64 8 pr.axis_rectangularity 843 non-null float64 9 max.length_rectangularity 846 non-null int64 10 scaled_variance 843 non-null float64 11 scaled_variance.1 844 non-null float64 12 scaled_radius_of_gyration 844 non-null float64 13 scaled_radius_of_gyration.1 842 non-null float64 14 skewness_about 840 non-null float64 15 skewness_about.1 845 non-null float64 16 skewness_about.2 845 non-null float64 17 hollows_ratio 846 non-null int64 18 class 846 non-null object dtypes: float64(14), int64(4), object(1) memory usage: 125.7+ KB
VEHICLE.describe()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 846.000000 | 841.000000 | 842.000000 | 840.000000 | 844.000000 | 846.000000 | 845.000000 | 845.000000 | 843.000000 | 846.000000 | 843.000000 | 844.000000 | 844.000000 | 842.000000 | 840.000000 | 845.000000 | 845.000000 | 846.000000 |
| mean | 93.678487 | 44.828775 | 82.110451 | 168.888095 | 61.678910 | 8.567376 | 168.901775 | 40.933728 | 20.582444 | 147.998818 | 188.631079 | 439.494076 | 174.709716 | 72.447743 | 6.364286 | 12.602367 | 188.919527 | 195.632388 |
| std | 8.234474 | 6.152172 | 15.778292 | 33.520198 | 7.891463 | 4.601217 | 33.214848 | 7.816186 | 2.592933 | 14.515652 | 31.411004 | 176.666903 | 32.584808 | 7.486190 | 4.920649 | 8.936081 | 6.155809 | 7.438797 |
| min | 73.000000 | 33.000000 | 40.000000 | 104.000000 | 47.000000 | 2.000000 | 112.000000 | 26.000000 | 17.000000 | 118.000000 | 130.000000 | 184.000000 | 109.000000 | 59.000000 | 0.000000 | 0.000000 | 176.000000 | 181.000000 |
| 25% | 87.000000 | 40.000000 | 70.000000 | 141.000000 | 57.000000 | 7.000000 | 147.000000 | 33.000000 | 19.000000 | 137.000000 | 167.000000 | 318.000000 | 149.000000 | 67.000000 | 2.000000 | 5.000000 | 184.000000 | 190.250000 |
| 50% | 93.000000 | 44.000000 | 80.000000 | 167.000000 | 61.000000 | 8.000000 | 157.000000 | 43.000000 | 20.000000 | 146.000000 | 179.000000 | 363.500000 | 173.500000 | 71.500000 | 6.000000 | 11.000000 | 188.000000 | 197.000000 |
| 75% | 100.000000 | 49.000000 | 98.000000 | 195.000000 | 65.000000 | 10.000000 | 198.000000 | 46.000000 | 23.000000 | 159.000000 | 217.000000 | 587.000000 | 198.000000 | 75.000000 | 9.000000 | 19.000000 | 193.000000 | 201.000000 |
| max | 119.000000 | 59.000000 | 112.000000 | 333.000000 | 138.000000 | 55.000000 | 265.000000 | 61.000000 | 29.000000 | 188.000000 | 320.000000 | 1018.000000 | 268.000000 | 135.000000 | 22.000000 | 41.000000 | 206.000000 | 211.000000 |
VEHICLE.shape
(846, 19)
VEHICLE['class'].value_counts()
car 429 bus 218 van 199 Name: class, dtype: int64
* THE DATA FRAME CONTAINS OF 846 ROWS AND 19 COLUMNS
* THERE ARE PRESENSE OF NULL VALUES IN SOME OF THE COLUMNS
* GOING FURTHER WE NEED TO DEAL WITH THE NULL VALUES TO PERFORM TESTING ON THE DATA.
* THERE ARE TOTAL 3 CLASSES CAR, BUS AND VAN.
* THERE ARE 429 CARS, 218 BUS AND 199 VAN
*HERE WE WILL CHECK FOR THE PRESENSE OF NULL VALUES IN THE DATA.
*WE CAN EITHER PROCEED WITH REPLACING THE NULL VALUES WITH MEDIAN OR DROP ALL THE ROWS WITH NULL VALUES.
*IN THIS CURRENT SCENARIO, WE WILL PROCEED WITH REPLACING THE NULL VALUES IN THE COLUMNS WITH MEDIAN.
COLUMNS = VEHICLE.columns
COLUMN_WITH_NULL = pd.DataFrame(columns = ['COLUMN_NAME','NULL_VALUES_COUNT'])
COLUMN_NAME = []
NULL_VALUES_COUNT = []
for i in COLUMNS:
COUNT = VEHICLE[i].isnull().sum()
if COUNT > 0:
COLUMN_NAME.append(i)
NULL_VALUES_COUNT.append(COUNT)
COLUMN_WITH_NULL = pd.DataFrame(list(zip(COLUMN_NAME,NULL_VALUES_COUNT)),columns = ['COLUMN_NAME','NULL_VALUES_COUNT'])
print('BELOW ARE THE COLUMNS WITH NULL VALUES AND THEIR COUNTS:\n\n', COLUMN_WITH_NULL)
print('\n\nTHERE ARE TOTAL OF',VEHICLE[VEHICLE.isnull().any(axis = 1)].shape[0],'ROWS WITH NULL VALUES IN THE VEHICLE DATA FRAME')
BELOW ARE THE COLUMNS WITH NULL VALUES AND THEIR COUNTS:
COLUMN_NAME NULL_VALUES_COUNT
0 circularity 5
1 distance_circularity 4
2 radius_ratio 6
3 pr.axis_aspect_ratio 2
4 scatter_ratio 1
5 elongatedness 1
6 pr.axis_rectangularity 3
7 scaled_variance 3
8 scaled_variance.1 2
9 scaled_radius_of_gyration 2
10 scaled_radius_of_gyration.1 4
11 skewness_about 6
12 skewness_about.1 1
13 skewness_about.2 1
THERE ARE TOTAL OF 33 ROWS WITH NULL VALUES IN THE VEHICLE DATA FRAME
# WE ARE GOING TO REPLACE THE NULL VALUES WITH THE MEDIAN OF THE COLUMN
for i in COLUMN_WITH_NULL['COLUMN_NAME']:
MEDIAN = VEHICLE[i].median()
VEHICLE[i] = VEHICLE[i].fillna(MEDIAN)
if VEHICLE[i].isnull().sum() == 0:
print('THERE ARE NO NULL VALUES IN THE DATASET AFTER IMPUTING WITH MEDIAN')
else:
print('THERE ARE NULL VALUES PRESENT IN THE DATA SET. NEED TO IMPUTE FURTHER')
THERE ARE NO NULL VALUES IN THE DATASET AFTER IMPUTING WITH MEDIAN
my_labels = VEHICLE['class'].unique()
colors = ['#ff9999','#66b3ff','#ffcc99']
my_explode = [0.2,0,0]
plt.pie(VEHICLE['class'].value_counts(sort = False),startangle=120,autopct='%1.1f%%',
labels = my_labels,colors = colors,explode = my_explode)
#CIRCLE = plt.Circle( (0,0), 0.7, color='white')
#PLOT = plt.gcf()
#PLOT.gca().add_artist(CIRCLE)
plt.show()
* CLASS COLUMN CONTAINS THREE TYPE OF VEHICLES : VAN, BUS, CARS
* OUT OF THREE CLASS CAR HAS MORE ROWS AND VAN HAS LESS ROWS.
* NO OF CARS ARE MORE THAN VAN AND BUS COMBINED.
* WE ARE GOING TO CHECK FOR DUPILICATE ROWS AND EITHER DROP THEM OR IMPUTE THEM.
ISDUPE = VEHICLE.duplicated().sum()
if ISDUPE==0:
print('\n\nNO DUPLICATE VALUES ARE PRESENT IN THE DATA')
else:
print('\n\nTHERE ARE DUPLICATE VALUES PRESENT IN THE DATA')
NO DUPLICATE VALUES ARE PRESENT IN THE DATA
* THERE ARE NO DUPLICATE VALUES PRESENT IN THE DATA SET.
VEHICLE['class'] = VEHICLE['class'].astype('category')
VEHICLE.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| compactness | 846.0 | 93.678487 | 8.234474 | 73.0 | 87.00 | 93.0 | 100.00 | 119.0 |
| circularity | 846.0 | 44.823877 | 6.134272 | 33.0 | 40.00 | 44.0 | 49.00 | 59.0 |
| distance_circularity | 846.0 | 82.100473 | 15.741569 | 40.0 | 70.00 | 80.0 | 98.00 | 112.0 |
| radius_ratio | 846.0 | 168.874704 | 33.401356 | 104.0 | 141.00 | 167.0 | 195.00 | 333.0 |
| pr.axis_aspect_ratio | 846.0 | 61.677305 | 7.882188 | 47.0 | 57.00 | 61.0 | 65.00 | 138.0 |
| max.length_aspect_ratio | 846.0 | 8.567376 | 4.601217 | 2.0 | 7.00 | 8.0 | 10.00 | 55.0 |
| scatter_ratio | 846.0 | 168.887707 | 33.197710 | 112.0 | 147.00 | 157.0 | 198.00 | 265.0 |
| elongatedness | 846.0 | 40.936170 | 7.811882 | 26.0 | 33.00 | 43.0 | 46.00 | 61.0 |
| pr.axis_rectangularity | 846.0 | 20.580378 | 2.588558 | 17.0 | 19.00 | 20.0 | 23.00 | 29.0 |
| max.length_rectangularity | 846.0 | 147.998818 | 14.515652 | 118.0 | 137.00 | 146.0 | 159.00 | 188.0 |
| scaled_variance | 846.0 | 188.596927 | 31.360427 | 130.0 | 167.00 | 179.0 | 217.00 | 320.0 |
| scaled_variance.1 | 846.0 | 439.314421 | 176.496341 | 184.0 | 318.25 | 363.5 | 586.75 | 1018.0 |
| scaled_radius_of_gyration | 846.0 | 174.706856 | 32.546277 | 109.0 | 149.00 | 173.5 | 198.00 | 268.0 |
| scaled_radius_of_gyration.1 | 846.0 | 72.443262 | 7.468734 | 59.0 | 67.00 | 71.5 | 75.00 | 135.0 |
| skewness_about | 846.0 | 6.361702 | 4.903244 | 0.0 | 2.00 | 6.0 | 9.00 | 22.0 |
| skewness_about.1 | 846.0 | 12.600473 | 8.930962 | 0.0 | 5.00 | 11.0 | 19.00 | 41.0 |
| skewness_about.2 | 846.0 | 188.918440 | 6.152247 | 176.0 | 184.00 | 188.0 | 193.00 | 206.0 |
| hollows_ratio | 846.0 | 195.632388 | 7.438797 | 181.0 | 190.25 | 197.0 | 201.00 | 211.0 |
VEHICLE.boxplot(figsize=(35,15))
plt.show()
* THERE ARE PRESENSE OF OUTLIERS IN THE DATA
* WE WILL PROCEED WITH TREATING THE OUTLIERS USING A SUITABLE APPROACH.
* BELOW ARE THE COLUMNS WITH OUTLIERS DATA
* 'radius_ratio','pr.axis_aspect_ratio','max.length_aspect_ratio','scaled_variance',
'scaled_variance.1','scaled_radius_of_gyration.1','skewness_about','skewness_about.1'
COL_OL = ['radius_ratio','pr.axis_aspect_ratio','max.length_aspect_ratio','scaled_variance',
'scaled_variance.1','scaled_radius_of_gyration.1','skewness_about','skewness_about.1']
# WE NEED TO CALCULATE THE QUARTILE 1 AND QUARTILE 3 AND USING THAT INTER QUARTILE RANGE
# THEN WE WILL CALCULATE LOWER RANGE AND UPPER RANGE USING QUARTILE 1,3, IQR
# AFTER THAT NEED TO REPLACE THE OUTLIERS THAT ARE BELOW LOWER RANGE AND ABOVE UPPER RANGE WITH MEDIAN.
for I in COL_OL:
QRT1 = VEHICLE[I].quantile(0.25)
QRT3 = VEHICLE[I].quantile(0.75)
IQR = QRT3-QRT1
LOWER = QRT1-1.5*IQR
UPPER = QRT3+1.5*IQR
VEHICLE.loc[(VEHICLE[I] < LOWER) | (VEHICLE[I] > UPPER), I] = VEHICLE[I].median()
VEHICLE.boxplot(figsize=(35,15))
plt.show()
* WE CAN SEE THAT NOW THE OUTLIERS ARE TOTALLY DEALT WITH.
VEHICLE.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| compactness | 846.0 | 93.678487 | 8.234474 | 73.0 | 87.00 | 93.0 | 100.00 | 119.0 |
| circularity | 846.0 | 44.823877 | 6.134272 | 33.0 | 40.00 | 44.0 | 49.00 | 59.0 |
| distance_circularity | 846.0 | 82.100473 | 15.741569 | 40.0 | 70.00 | 80.0 | 98.00 | 112.0 |
| radius_ratio | 846.0 | 168.330969 | 32.147908 | 104.0 | 141.00 | 167.0 | 194.75 | 252.0 |
| pr.axis_aspect_ratio | 846.0 | 61.154846 | 5.613458 | 47.0 | 57.00 | 61.0 | 65.00 | 76.0 |
| max.length_aspect_ratio | 846.0 | 8.118203 | 2.064114 | 3.0 | 7.00 | 8.0 | 10.00 | 13.0 |
| scatter_ratio | 846.0 | 168.887707 | 33.197710 | 112.0 | 147.00 | 157.0 | 198.00 | 265.0 |
| elongatedness | 846.0 | 40.936170 | 7.811882 | 26.0 | 33.00 | 43.0 | 46.00 | 61.0 |
| pr.axis_rectangularity | 846.0 | 20.580378 | 2.588558 | 17.0 | 19.00 | 20.0 | 23.00 | 29.0 |
| max.length_rectangularity | 846.0 | 147.998818 | 14.515652 | 118.0 | 137.00 | 146.0 | 159.00 | 188.0 |
| scaled_variance | 846.0 | 188.430260 | 31.034232 | 130.0 | 167.00 | 179.0 | 216.75 | 288.0 |
| scaled_variance.1 | 846.0 | 437.790780 | 174.346065 | 184.0 | 318.25 | 363.5 | 586.00 | 987.0 |
| scaled_radius_of_gyration | 846.0 | 174.706856 | 32.546277 | 109.0 | 149.00 | 173.5 | 198.00 | 268.0 |
| scaled_radius_of_gyration.1 | 846.0 | 71.943853 | 6.158852 | 59.0 | 67.00 | 71.5 | 75.00 | 87.0 |
| skewness_about | 846.0 | 6.147754 | 4.572950 | 0.0 | 2.00 | 6.0 | 9.00 | 19.0 |
| skewness_about.1 | 846.0 | 12.565012 | 8.877465 | 0.0 | 5.00 | 11.0 | 19.00 | 40.0 |
| skewness_about.2 | 846.0 | 188.918440 | 6.152247 | 176.0 | 184.00 | 188.0 | 193.00 | 206.0 |
| hollows_ratio | 846.0 | 195.632388 | 7.438797 | 181.0 | 190.25 | 197.0 | 201.00 | 211.0 |
VEHICLE.corr()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| compactness | 1.000000 | 0.684887 | 0.789928 | 0.721925 | 0.192864 | 0.499928 | 0.812620 | -0.788750 | 0.813694 | 0.676143 | 0.769871 | 0.806170 | 0.585243 | -0.246681 | 0.197308 | 0.156348 | 0.298537 | 0.365552 |
| circularity | 0.684887 | 1.000000 | 0.792320 | 0.638280 | 0.203253 | 0.560470 | 0.847938 | -0.821472 | 0.843400 | 0.961318 | 0.802768 | 0.827462 | 0.925816 | 0.068745 | 0.136351 | -0.009666 | -0.104426 | 0.046351 |
| distance_circularity | 0.789928 | 0.792320 | 1.000000 | 0.794222 | 0.244332 | 0.666809 | 0.905076 | -0.911307 | 0.893025 | 0.774527 | 0.869584 | 0.883943 | 0.705771 | -0.229353 | 0.099107 | 0.262345 | 0.146098 | 0.332732 |
| radius_ratio | 0.721925 | 0.638280 | 0.794222 | 1.000000 | 0.650554 | 0.463958 | 0.769941 | -0.825392 | 0.744139 | 0.579468 | 0.786183 | 0.760257 | 0.550774 | -0.390459 | 0.035755 | 0.179601 | 0.405849 | 0.491758 |
| pr.axis_aspect_ratio | 0.192864 | 0.203253 | 0.244332 | 0.650554 | 1.000000 | 0.150295 | 0.194195 | -0.298144 | 0.163047 | 0.147592 | 0.207101 | 0.196401 | 0.148591 | -0.321070 | -0.056030 | -0.021088 | 0.400882 | 0.415734 |
| max.length_aspect_ratio | 0.499928 | 0.560470 | 0.666809 | 0.463958 | 0.150295 | 1.000000 | 0.490759 | -0.504181 | 0.487931 | 0.642713 | 0.401391 | 0.463249 | 0.397397 | -0.335444 | 0.081898 | 0.141664 | 0.083794 | 0.413174 |
| scatter_ratio | 0.812620 | 0.847938 | 0.905076 | 0.769941 | 0.194195 | 0.490759 | 1.000000 | -0.971601 | 0.989751 | 0.809083 | 0.960883 | 0.980447 | 0.799875 | 0.011314 | 0.064242 | 0.211647 | 0.005628 | 0.118817 |
| elongatedness | -0.788750 | -0.821472 | -0.911307 | -0.825392 | -0.298144 | -0.504181 | -0.971601 | 1.000000 | -0.948996 | -0.775854 | -0.947644 | -0.948851 | -0.766314 | 0.078391 | -0.046943 | -0.183642 | -0.115126 | -0.216905 |
| pr.axis_rectangularity | 0.813694 | 0.843400 | 0.893025 | 0.744139 | 0.163047 | 0.487931 | 0.989751 | -0.948996 | 1.000000 | 0.810934 | 0.947329 | 0.973606 | 0.796690 | 0.027545 | 0.073127 | 0.213801 | -0.018649 | 0.099286 |
| max.length_rectangularity | 0.676143 | 0.961318 | 0.774527 | 0.579468 | 0.147592 | 0.642713 | 0.809083 | -0.775854 | 0.810934 | 1.000000 | 0.750222 | 0.789632 | 0.866450 | 0.053856 | 0.130702 | 0.004129 | -0.103948 | 0.076770 |
| scaled_variance | 0.769871 | 0.802768 | 0.869584 | 0.786183 | 0.207101 | 0.401391 | 0.960883 | -0.947644 | 0.947329 | 0.750222 | 1.000000 | 0.943780 | 0.785073 | 0.025828 | 0.024693 | 0.197122 | 0.015171 | 0.086330 |
| scaled_variance.1 | 0.806170 | 0.827462 | 0.883943 | 0.760257 | 0.196401 | 0.463249 | 0.980447 | -0.948851 | 0.973606 | 0.789632 | 0.943780 | 1.000000 | 0.782972 | 0.009386 | 0.065731 | 0.204941 | 0.017557 | 0.119642 |
| scaled_radius_of_gyration | 0.585243 | 0.925816 | 0.705771 | 0.550774 | 0.148591 | 0.397397 | 0.799875 | -0.766314 | 0.796690 | 0.866450 | 0.785073 | 0.782972 | 1.000000 | 0.215279 | 0.162970 | -0.055667 | -0.224450 | -0.118002 |
| scaled_radius_of_gyration.1 | -0.246681 | 0.068745 | -0.229353 | -0.390459 | -0.321070 | -0.335444 | 0.011314 | 0.078391 | 0.027545 | 0.053856 | 0.025828 | 0.009386 | 0.215279 | 1.000000 | -0.057755 | -0.123996 | -0.832738 | -0.901332 |
| skewness_about | 0.197308 | 0.136351 | 0.099107 | 0.035755 | -0.056030 | 0.081898 | 0.064242 | -0.046943 | 0.073127 | 0.130702 | 0.024693 | 0.065731 | 0.162970 | -0.057755 | 1.000000 | -0.041734 | 0.086661 | 0.062619 |
| skewness_about.1 | 0.156348 | -0.009666 | 0.262345 | 0.179601 | -0.021088 | 0.141664 | 0.211647 | -0.183642 | 0.213801 | 0.004129 | 0.197122 | 0.204941 | -0.055667 | -0.123996 | -0.041734 | 1.000000 | 0.074473 | 0.200651 |
| skewness_about.2 | 0.298537 | -0.104426 | 0.146098 | 0.405849 | 0.400882 | 0.083794 | 0.005628 | -0.115126 | -0.018649 | -0.103948 | 0.015171 | 0.017557 | -0.224450 | -0.832738 | 0.086661 | 0.074473 | 1.000000 | 0.892581 |
| hollows_ratio | 0.365552 | 0.046351 | 0.332732 | 0.491758 | 0.415734 | 0.413174 | 0.118817 | -0.216905 | 0.099286 | 0.076770 | 0.086330 | 0.119642 | -0.118002 | -0.901332 | 0.062619 | 0.200651 | 0.892581 | 1.000000 |
plt.figure(figsize = (20,20))
sns.heatmap(VEHICLE.drop(columns = 'class').corr(), annot = True, linewidths = 1, square = True, cmap="YlOrRd")
plt.show()
sns.pairplot(VEHICLE, hue = "class", diag_kind = "kde",kind = "scatter", markers=["o", "s", "D"],palette = "Set2")
plt.show()
VEHICLE['class']=LabelEncoder().fit_transform(VEHICLE['class'])
* THROUGH LABEL ENCODING, WE HAVE CONVERTED THE CATEGORICAL VARIABLE.
* 1 REPRESENTS CARS, 0 REPRESENTS BUS AND 2 REPRESENTS VAN.
# WE HAVE SPLIT THE DATA INTO X AND Y. WHERE Y CONTAINS THE DEPENDENT VARIABLE AND X CONTAINS INDEPENDENT VARIABLE
X = VEHICLE.drop(columns = 'class')
Y = VEHICLE['class']
# WE ARE GOING TO SPLIT THE DATA INTO TRAIN AND TEST DATA.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(X,Y,test_size = 0.3, random_state = 10)
# WE ARE GOING TO STANDARDIZE THE DATA BY APPLYING Z_SCORE ON INDEPENDENT VARIABLES.
X_STD = X.apply(zscore)
# WE ARE GOING TO SPLIT THE DATA INTO TRAIN AND TEST DATA AGAIN AFTER STANDARDISING THE DATA.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(X_STD,Y,test_size = 0.3, random_state = 10)
fig = plt.figure(figsize=(15,6))
plt.plot(VEHICLE)
plt.show()
fig = plt.figure(figsize=(15,6))
plt.plot(X_STD)
plt.show()
* FROM THE ABOVE PLOTS, WE CAN SEE THAT AFTER STANDARDIZING, SCALE OF THE VARIABLES CAME DOWN TO VERY LOW VALUES.
from sklearn.svm import SVC
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
MODEL_TYPE = []
MODEL_TYPE.append(('SVM-LINEAR', SVC(kernel='linear',C=1,gamma=.6)))
MODEL_TYPE.append(('SVM-POLY', SVC(kernel='rbf',degree=2,C=.009)))
MODEL_TYPE.append(('SVM-RBF', SVC(kernel='poly',degree=2,gamma=0.1,C=.01)))
RESULTS = pd.DataFrame()
for name, model in MODEL_TYPE:
MODEL_EVAL = model
MODEL_EVAL.fit(X_TRAIN, Y_TRAIN)
Y_PRED = MODEL_EVAL.predict(X_TEST)
ACCURACY = accuracy_score(Y_TEST, Y_PRED)
PRECISION = precision_score(Y_TEST, Y_PRED,average='weighted')
RECALL = recall_score(Y_TEST, Y_PRED,average='weighted')
F1_SCORE = f1_score(Y_TEST, Y_PRED,average='weighted')
RESULTS = RESULTS.append(pd.Series([name,ACCURACY,PRECISION,RECALL,F1_SCORE]),ignore_index=True)
RESULTS.columns = ['MODEL_NAME','ACCURACY','PRECISION','RECALL','F1-SCORE']
print(RESULTS)
MODEL_NAME ACCURACY PRECISION RECALL F1-SCORE 0 SVM-LINEAR 0.933071 0.935659 0.933071 0.933162 1 SVM-POLY 0.492126 0.242188 0.492126 0.324621 2 SVM-RBF 0.515748 0.527575 0.515748 0.373406
* WE HAVE RUN THE SVM MODEL USING LINEAR, POLY AND RBF
* SVM_LINEAR MODEL IS SHOWN TO PERFORM BEST OUT OF OTHER TWO MODELS.
* WE WILL PROCEED TO USE THE LINEAR MODEL AND PRINT THE SCORES FOR TRAIN AND TEST DATA
SVM_MODEL = SVC(kernel='linear',C=1,gamma=.6)
SVM_MODEL.fit(X_TRAIN, Y_TRAIN)
PREDICTION = SVM_MODEL.predict(X_TEST)
print('ACCURACY ON TRAIN DATA:',SVM_MODEL.score(X_TRAIN,Y_TRAIN))
print('\n\nACCURACY ON TEST DATA:',SVM_MODEL.score(X_TEST,Y_TEST))
ACCURACY ON TRAIN DATA: 0.9577702702702703 ACCURACY ON TEST DATA: 0.9330708661417323
* WE COULD SEE THAT TRAIN DATA ACCURACY IS ABOVE 95% AND TEST DATA ABOVE 93%
* SVM LINEAR MODEL BEST FIT THE DATA.
print(classification_report(Y_TEST,PREDICTION),'\n')
precision recall f1-score support
0 0.93 0.94 0.94 71
1 0.97 0.90 0.93 125
2 0.88 0.98 0.93 58
accuracy 0.93 254
macro avg 0.92 0.94 0.93 254
weighted avg 0.94 0.93 0.93 254
from sklearn.decomposition import PCA
PCA_DATA = PCA(n_components=10)
PCA_X = PCA_DATA.fit_transform(X_STD)
PCA_DF = pd.DataFrame(data = PCA_DATA.components_, columns = list(X_STD))
PCA_DF
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.272503 | 0.287255 | 0.302421 | 0.269714 | 0.097861 | 0.195200 | 0.310524 | -0.309007 | 0.307287 | 0.278154 | 0.299765 | 0.305532 | 0.263238 | -0.041936 | 0.036083 | 0.058720 | 0.038013 | 0.084740 |
| 1 | -0.087044 | 0.131622 | -0.046143 | -0.197931 | -0.257840 | -0.108046 | 0.075285 | -0.013230 | 0.087560 | 0.122154 | 0.077266 | 0.071503 | 0.210582 | 0.503622 | -0.015766 | -0.092746 | -0.501621 | -0.507612 |
| 2 | -0.038185 | -0.201147 | 0.063462 | 0.056285 | -0.061993 | -0.148958 | 0.109043 | -0.090853 | 0.106070 | -0.213685 | 0.144600 | 0.110344 | -0.202870 | 0.073864 | -0.559174 | 0.670680 | -0.062241 | -0.041705 |
| 3 | 0.138675 | -0.038055 | 0.108954 | -0.254355 | -0.612766 | 0.278678 | 0.005393 | 0.065215 | 0.030899 | 0.041467 | -0.064005 | -0.002197 | -0.085540 | -0.115400 | 0.473703 | 0.428426 | -0.027410 | 0.096037 |
| 4 | 0.137101 | -0.138996 | -0.080017 | 0.133744 | 0.123601 | -0.634893 | 0.085557 | -0.079073 | 0.081646 | -0.251113 | 0.147471 | 0.110101 | -0.005212 | 0.138069 | 0.566552 | 0.130870 | 0.180519 | -0.110788 |
| 5 | 0.263611 | -0.071347 | -0.016901 | -0.138184 | -0.577829 | -0.289097 | 0.097747 | -0.075728 | 0.105403 | -0.078196 | 0.132912 | 0.115398 | -0.067057 | -0.131513 | -0.319176 | -0.468405 | 0.280136 | 0.059444 |
| 6 | 0.202717 | -0.392275 | 0.163371 | 0.161911 | 0.092763 | 0.398266 | 0.092352 | -0.104071 | 0.093132 | -0.354564 | 0.068055 | 0.090119 | -0.455293 | 0.085823 | 0.124532 | -0.302518 | -0.258250 | -0.173269 |
| 7 | -0.758796 | -0.067603 | 0.277372 | 0.110545 | -0.186859 | -0.046219 | 0.064620 | -0.192343 | 0.013868 | -0.215163 | 0.195679 | 0.037795 | 0.146753 | -0.330395 | 0.114255 | -0.115404 | -0.094660 | -0.006497 |
| 8 | 0.366686 | 0.055326 | 0.074678 | 0.266667 | -0.038630 | -0.137163 | -0.131568 | 0.289634 | -0.089529 | -0.158232 | 0.042703 | -0.151073 | 0.263771 | -0.555267 | -0.059904 | 0.052385 | -0.379169 | -0.280341 |
| 9 | -0.160045 | 0.182324 | -0.273034 | 0.050599 | 0.034304 | -0.177961 | 0.143133 | 0.079383 | 0.239897 | 0.382739 | -0.166091 | 0.287458 | -0.549627 | -0.362547 | 0.057989 | -0.012900 | -0.187849 | -0.133403 |
COV_MX = np.cov(X_STD.T)
EIGEN_VALUES, EIGEN_VECTORS = np.linalg.eig(COV_MX)
print(EIGEN_VALUES)
[9.74940269e+00 3.35071912e+00 1.19238155e+00 1.13381916e+00 8.83997312e-01 6.66265745e-01 3.18150910e-01 2.28179142e-01 1.31018595e-01 7.98619108e-02 7.33979478e-02 6.46162669e-02 5.16287320e-03 4.01448646e-02 1.98136761e-02 2.27005257e-02 3.22758478e-02 2.93936408e-02]
print(EIGEN_VECTORS)
[[-2.72502890e-01 -8.70435783e-02 3.81852075e-02 1.38675013e-01 -1.37101466e-01 2.63611383e-01 2.02717114e-01 -7.58796410e-01 3.66685918e-01 1.60045219e-01 8.40252779e-02 2.14645175e-02 -1.87350749e-02 6.89082276e-02 4.26105276e-02 9.97784975e-02 -8.22590084e-02 -3.30366937e-02] [-2.87254690e-01 1.31621757e-01 2.01146908e-01 -3.80554832e-02 1.38995553e-01 -7.13474241e-02 -3.92275358e-01 -6.76034223e-02 5.53261885e-02 -1.82323962e-01 -3.65229874e-02 1.47247511e-01 -4.89102355e-02 5.90534770e-02 -6.74107885e-01 1.63466948e-01 -2.59100771e-01 2.48832011e-01] [-3.02421105e-01 -4.61430061e-02 -6.34621085e-02 1.08954287e-01 8.00174278e-02 -1.69006151e-02 1.63371282e-01 2.77371950e-01 7.46784853e-02 2.73033778e-01 4.68505530e-01 6.52730855e-01 4.74162132e-03 -1.62108150e-01 -4.99754439e-04 -6.36582307e-02 1.20629778e-01 9.80561531e-02] [-2.69713545e-01 -1.97931263e-01 -5.62851689e-02 -2.54355087e-01 -1.33744367e-01 -1.38183653e-01 1.61910525e-01 1.10544748e-01 2.66666666e-01 -5.05987218e-02 -5.45526034e-01 7.52188680e-02 3.70499547e-03 -3.93288246e-01 1.74861248e-01 -1.33284415e-01 -1.86241567e-01 3.60765151e-01] [-9.78607336e-02 -2.57839952e-01 6.19927464e-02 -6.12765722e-01 -1.23601456e-01 -5.77828612e-01 9.27633094e-02 -1.86858758e-01 -3.86296562e-02 -3.43037888e-02 2.65023238e-01 -2.40287269e-02 8.90928349e-03 1.63771153e-01 -6.31976228e-02 2.14665592e-02 1.24639367e-01 -1.77647590e-01] [-1.95200137e-01 -1.08045626e-01 1.48957820e-01 2.78678159e-01 6.34893356e-01 -2.89096995e-01 3.98266293e-01 -4.62187969e-02 -1.37163365e-01 1.77960797e-01 -1.92846020e-01 -2.29741488e-01 4.09727876e-03 1.36576102e-01 -9.62482815e-02 -6.89934316e-02 1.40804371e-01 9.99006987e-02] [-3.10523932e-01 7.52853487e-02 -1.09042833e-01 5.39294828e-03 -8.55574543e-02 9.77471088e-02 9.23519412e-02 6.46204209e-02 -1.31567659e-01 -1.43132644e-01 9.67172431e-02 -1.53118496e-01 8.55513044e-01 6.48917601e-02 -4.36596954e-02 -1.56585696e-01 -1.43109720e-01 -5.28457504e-02] [ 3.09006904e-01 -1.32299375e-02 9.08526930e-02 6.52148575e-02 7.90734442e-02 -7.57282937e-02 -1.04070600e-01 -1.92342823e-01 2.89633509e-01 -7.93831124e-02 -2.29926427e-02 2.33454000e-02 2.61858734e-01 -4.96273257e-01 -3.08568675e-01 -2.44030327e-01 5.11966770e-01 -9.49906147e-02] [-3.07287000e-01 8.75601978e-02 -1.06070496e-01 3.08991500e-02 -8.16463820e-02 1.05403228e-01 9.31317767e-02 1.38684573e-02 -8.95291026e-02 -2.39896699e-01 1.59356923e-01 -2.17636238e-01 -4.22479708e-01 -1.13664100e-01 -1.63739102e-01 -6.71547392e-01 -6.75916711e-02 -2.16727165e-01] [-2.78154157e-01 1.22154240e-01 2.13684693e-01 4.14674720e-02 2.51112937e-01 -7.81962142e-02 -3.54564344e-01 -2.15163418e-01 -1.58231983e-01 -3.82739482e-01 -1.42837015e-01 3.15261003e-01 2.00493082e-02 -8.66067604e-03 5.08763287e-01 -5.00643538e-02 1.60926059e-01 -2.00262071e-01] [-2.99765086e-01 7.72657535e-02 -1.44599805e-01 -6.40050869e-02 -1.47471227e-01 1.32912405e-01 6.80546125e-02 1.95678724e-01 4.27034669e-02 1.66090908e-01 -4.59667614e-01 1.18383161e-01 -4.15194745e-02 1.35985919e-01 -2.52182911e-01 2.17416166e-01 3.24139804e-01 -5.53139002e-01] [-3.05532374e-01 7.15030171e-02 -1.10343735e-01 -2.19687048e-03 -1.10100984e-01 1.15398218e-01 9.01194270e-02 3.77948210e-02 -1.51072666e-01 -2.87457686e-01 2.09345615e-01 -3.31340876e-01 -1.22365190e-01 -2.42922436e-01 3.94502237e-02 4.48936624e-01 4.62827872e-01 3.22499534e-01] [-2.63237620e-01 2.10582046e-01 2.02870191e-01 -8.55396458e-02 5.21210685e-03 -6.70573978e-02 -4.55292717e-01 1.46752664e-01 2.63771332e-01 5.49626527e-01 1.07713508e-01 -3.99260390e-01 1.66056546e-02 -3.30876118e-02 2.03029913e-01 -1.06621517e-01 8.55669069e-02 2.40609291e-02] [ 4.19359352e-02 5.03621577e-01 -7.38640211e-02 -1.15399624e-01 -1.38068605e-01 -1.31513077e-01 8.58226790e-02 -3.30394999e-01 -5.55267166e-01 3.62547303e-01 -1.26596148e-01 1.21942784e-01 1.27186667e-03 -2.96030848e-01 -5.79407509e-02 -3.08034829e-02 -5.10909842e-02 8.79644677e-02] [-3.60832115e-02 -1.57663214e-02 5.59173987e-01 4.73703309e-01 -5.66552244e-01 -3.19176094e-01 1.24532179e-01 1.14255395e-01 -5.99039250e-02 -5.79891873e-02 -3.25785780e-02 2.88590518e-03 -4.24341185e-04 4.01635562e-03 -8.22261600e-03 2.05544442e-02 -4.39201991e-03 -3.76172016e-02] [-5.87204797e-02 -9.27462386e-02 -6.70680496e-01 4.28426032e-01 -1.30869817e-01 -4.68404967e-01 -3.02517700e-01 -1.15403870e-01 5.23845772e-02 1.28995278e-02 -3.62255133e-02 -1.62495314e-02 -9.40554994e-03 8.00562035e-02 1.12172401e-02 -2.31296836e-03 1.13702813e-02 4.44850199e-02] [-3.80131449e-02 -5.01621218e-01 6.22407145e-02 -2.74095968e-02 -1.80519293e-01 2.80136438e-01 -2.58250261e-01 -9.46599623e-02 -3.79168935e-01 1.87848521e-01 -1.38657118e-01 8.24506703e-02 2.60800892e-02 2.45816461e-01 -7.88567114e-02 -2.81093089e-01 3.19960307e-01 3.19055407e-01] [-8.47399995e-02 -5.07612106e-01 4.17053530e-02 9.60374943e-02 1.10788067e-01 5.94444089e-02 -1.73269228e-01 -6.49718344e-03 -2.80340510e-01 1.33402674e-01 8.39926899e-02 -1.29951586e-01 -4.18109835e-03 -5.18420304e-01 -3.18514877e-02 2.41164948e-01 -3.10989286e-01 -3.65128378e-01]]
PCA_REDUCED = np.array(EIGEN_VECTORS[0:10])
X_STD_10 = np.dot(X_STD,PCA_REDUCED.T)
PCA_DIM10 = pd.DataFrame(X_STD_10)
PCA_DIM10.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.187067 | 0.291304 | -0.109543 | -0.341230 | -1.366761 | 1.109115 | 0.031458 | -0.007252 | -0.590437 | 0.083418 |
| 1 | -0.578067 | -0.165120 | -0.783569 | 0.796478 | 0.188025 | -0.690496 | -0.230511 | -0.823160 | 0.146608 | 0.214858 |
| 2 | 1.642473 | -1.488873 | 1.056498 | -1.066004 | -1.230999 | 0.432837 | 0.484535 | 0.679934 | -1.254475 | 0.788387 |
| 3 | -1.200693 | 0.086752 | -0.357946 | 1.211271 | -0.644191 | 0.329381 | -1.499561 | 0.876219 | 0.613940 | 0.174215 |
| 4 | -0.067616 | -0.374246 | 0.477125 | -1.266510 | -0.158059 | -0.413354 | 1.407978 | -1.152164 | 1.226641 | 0.455983 |
sns.pairplot(PCA_DIM10,diag_kind = "kde",palette = "Set2")
plt.show()
# FIRST WE WILL FIND OUT THE VARIANCE AND CUMULATIVE VARIANCE
TOTAL = sum(EIGEN_VALUES)
VAR = [( i /TOTAL ) * 100 for i in sorted(EIGEN_VALUES, reverse=True)]
C_VAR = np.cumsum(VAR)
print("CUMULATIVE VARIANCE", C_VAR)
CUMULATIVE VARIANCE [ 54.0993254 72.69242795 79.30893968 85.60048941 90.50578051 94.2028816 95.96829741 97.23446089 97.96148159 98.40463444 98.81191882 99.17047375 99.39323715 99.57233547 99.73544045 99.86140541 99.97135127 100. ]
plt.plot(VAR)
[<matplotlib.lines.Line2D at 0x20e4ab136d0>]
plt.figure(figsize=(8 , 6))
plt.bar(range(1, EIGEN_VALUES.size + 1), VAR, alpha = 0.5, align = 'center', label = 'INDIVIDUAL VARIANCE')
plt.step(range(1, EIGEN_VALUES.size + 1), C_VAR, where='mid', label = 'CUMULATIVE VARIANCE')
plt.ylabel('VARIANCE RATIO')
plt.xlabel('PRINCIPAL COMPONENTS')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
plt.figure(figsize=(8 , 6))
plt.bar(range(1, EIGEN_VALUES.size + 1), VAR, alpha = 0.5, align = 'center', label = 'INDIVIDUAL VARIANCE')
plt.step(range(1, EIGEN_VALUES.size + 1), C_VAR, where='mid', label = 'CUMULATIVE VARIANCE')
plt.axhline(y = 90, color = 'r', linestyle = 'dashed', label = "THRESHOLD = 90%")
plt.ylabel('VARIANCE RATION')
plt.xlabel('PRINCIPAL COMPONENTS')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
* IN THE ABOVE PLOT A DASHED RED LINE IS DRAWN ACROSS THE PLOT THAT REPRESENTS THE THRESHOLD OF 90%
* AT THE POINT WHERE THE LINE CUTS THE CUMULATIVE VARIANCE, WE CAN SEE THAT 5 PRINCIPLE COMPONENTS
ARE ENOUGH TO BE AT 90% THRESHOLD.
PCA_6D = PCA(n_components=6)
PCA_DATA_6D = PCA_6D.fit_transform(X_STD)
PCA_X_6D = pd.DataFrame(PCA_6D.components_,columns=list(X_STD))
P_6D = np.array(EIGEN_VECTORS[0:6])
X_STD_6D = np.dot(X_STD,P_6D.T)
PCA_DIM6 = pd.DataFrame(X_STD_6D)
X_TRAIN_6D, X_TEST_6D, Y_TRAIN_6D, Y_TEST_6D = train_test_split(PCA_DIM6, Y, test_size = 0.3, random_state = 10)
* FROM THE ABOVE PLOT, WE CAN SEE THAT 90% THRESHOLD CROSS THROUGH THE 5 COMPONENTS
* WE CAN SELECT 6 COMPONENTS WHERE THE THRESHOLD IS CROSSING ABOVE 90%
sns.pairplot(PCA_DIM6,diag_kind = "kde",palette = "Set2")
plt.show()
MODEL_TYPE = []
MODEL_TYPE.append(('SVM-LINEAR', SVC(kernel='linear',C=1,gamma=.6)))
MODEL_TYPE.append(('SVM-POLY', SVC(kernel='rbf',degree=2,C=.009)))
MODEL_TYPE.append(('SVM-RBF', SVC(kernel='poly',degree=2,gamma=0.1,C=.01)))
RESULTS = pd.DataFrame()
for name, model in MODEL_TYPE:
MODEL_EVAL = model
MODEL_EVAL.fit(X_TRAIN_6D, Y_TRAIN_6D)
Y_PRED = MODEL_EVAL.predict(X_TEST_6D)
ACCURACY = accuracy_score(Y_TEST_6D, Y_PRED)
PRECISION = precision_score(Y_TEST_6D, Y_PRED,average='weighted')
RECALL = recall_score(Y_TEST_6D, Y_PRED,average='weighted')
F1_SCORE = f1_score(Y_TEST_6D, Y_PRED,average='weighted')
RESULTS = RESULTS.append(pd.Series([name,ACCURACY,PRECISION,RECALL,F1_SCORE]),ignore_index=True)
RESULTS.columns = ['MODEL_NAME','ACCURACY','PRECISION','RECALL','F1-SCORE']
print(RESULTS)
MODEL_NAME ACCURACY PRECISION RECALL F1-SCORE 0 SVM-LINEAR 0.803150 0.807035 0.803150 0.804094 1 SVM-POLY 0.492126 0.242188 0.492126 0.324621 2 SVM-RBF 0.492126 0.242188 0.492126 0.324621
SVM_MODEL = SVC(kernel='linear',C=1,gamma=.6)
SVM_MODEL.fit(X_TRAIN_6D, Y_TRAIN_6D)
PREDICTION = SVM_MODEL.predict(X_TEST_6D)
print('ACCURACY ON TRAIN DATA AFTER APPLYING PCA:',SVM_MODEL.score(X_TRAIN_6D,Y_TRAIN_6D))
print('\n\nACCURACY ON TEST DATA AFTER APPLYING PCA:',SVM_MODEL.score(X_TEST_6D,Y_TEST_6D))
ACCURACY ON TRAIN DATA AFTER APPLYING PCA: 0.8378378378378378 ACCURACY ON TEST DATA AFTER APPLYING PCA: 0.8031496062992126
print(classification_report(Y_TEST_6D,PREDICTION),'\n')
precision recall f1-score support
0 0.85 0.82 0.83 71
1 0.82 0.79 0.81 125
2 0.71 0.81 0.76 58
accuracy 0.80 254
macro avg 0.80 0.81 0.80 254
weighted avg 0.81 0.80 0.80 254
* WE HAVE SELECTED ONLY 6 COMPONENTS THAT CAN BE USED TO ACHIEVE 90% OF MODEL ACCURACY OVER THE EARLIER MODEL
* ON REDUCING THE COMPONENTS, WE CAN SEE THAT ACCURACY, PRECISION, RECALL, F1-SCORE GOT REDUCED.
* STILL THE MODEL IS HAVING ACCURACY OF 83.7% ON TEST DATA AND 80.3% ON TEST DATA.
PCA_FINAL = PCA(n_components=18)
PCA_X_FINAL = PCA_FINAL.fit_transform(X_STD)
PCA_FINAL_DF = pd.DataFrame(data = PCA_X_FINAL, columns = list(X_STD))
COV_MX_FINAL = np.cov(X_STD.T)
EIGEN_VALUES_FIN, EIGEN_VECTORS_FIN = np.linalg.eig(COV_MX_FINAL)
PCA_RESULTS = pd.DataFrame()
for i in np.arange(1,19,1):
PCA_MIN = np.array(EIGEN_VECTORS_FIN[0:i])
X_STD_D = np.dot(X_STD,PCA_MIN.T)
PCA_DF_FIN = pd.DataFrame(X_STD_D)
X_TRAIN_PCA, X_TEST_PCA, Y_TRAIN_PCA, Y_TEST_PCA = train_test_split(PCA_DF_FIN, Y, test_size = 0.3, random_state = 10)
SVM_MODEL = SVC(kernel='linear',C=1,gamma=.6)
SVM_MODEL.fit(X_TRAIN_PCA, Y_TRAIN_PCA)
PREDICTION = SVM_MODEL.predict(X_TEST_PCA)
TRAIN_ACC = SVM_MODEL.score(X_TRAIN_PCA,Y_TRAIN_PCA)
TEST_ACC = SVM_MODEL.score(X_TEST_PCA,Y_TEST_PCA)
PCA_RESULTS = PCA_RESULTS.append(pd.Series([i,TRAIN_ACC,TEST_ACC]),ignore_index=True)
PCA_RESULTS.columns = ['NO. OF PRINCIPAL COMPONENTS', 'TRAINING ACCURACY','TESTING ACCURACY']
print('COMPARING MODEL PERFORMANCE AT SELECTION OF DIFFERENT PRINCIPLE COMPONENTS:\n')
print('***************************************************************************\n')
print(PCA_RESULTS)
COMPARING MODEL PERFORMANCE AT SELECTION OF DIFFERENT PRINCIPLE COMPONENTS:
***************************************************************************
NO. OF PRINCIPAL COMPONENTS TRAINING ACCURACY TESTING ACCURACY
0 1.0 0.559122 0.515748
1 2.0 0.565878 0.523622
2 3.0 0.572635 0.531496
3 4.0 0.709459 0.696850
4 5.0 0.807432 0.775591
5 6.0 0.837838 0.803150
6 7.0 0.836149 0.826772
7 8.0 0.841216 0.842520
8 9.0 0.858108 0.858268
9 10.0 0.871622 0.866142
10 11.0 0.871622 0.877953
11 12.0 0.875000 0.885827
12 13.0 0.905405 0.885827
13 14.0 0.902027 0.885827
14 15.0 0.945946 0.921260
15 16.0 0.947635 0.933071
16 17.0 0.949324 0.952756
17 18.0 0.957770 0.933071
* WE HAVE RAN THE SVM MODEL ON DIFFERENT SELECTION OF PRINCIPLE COMPONENTS.
* AS THE NO. OF COMPONENTS ARE GETTING INCREASED, WE CAN SEE THAT MODEL PERFORMANCE IS ALSO INCREASED.
* AT 13 COMPONENTS, WE CAN SEE THAT MODEL IS ACHIEVING MORE THAN 90% ACCURACY.
* WHEN ALL COMPONENTS ARE SELECTED, WE CAN SEE THAT MODEL IS HAVING MAXIMUM ACCURACY OF 95.77%
* AT 15 COMPONENTS MODEL ACCURACY IS 94.59% AND AFTER THAT WE COULD SEE ONLY MARGINAL INCREASE IN ACCURACY.
* FROM NO. OF COMPONENTS AT 5, MODEL STARTED PERFORMING GOOD.
* IN THIS ASSESMENT TO KEEP THE MODEL PERFORMANCE AT 90%, I AM CONSIDERING 13 COMPONENTS TO TRAIN THE
SVM MODEL FURTHER AND PARAMETER TUNING
fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
sns.scatterplot(x=range(1,19),y=PCA_RESULTS['TRAINING ACCURACY'],hue = PCA_RESULTS['NO. OF PRINCIPAL COMPONENTS'],
ax=axes[0],palette="deep",legend=False)
sns.scatterplot(x=range(1,19),y=PCA_RESULTS['TESTING ACCURACY'],hue = PCA_RESULTS['NO. OF PRINCIPAL COMPONENTS'],
ax=axes[1],palette="deep",legend=False)
axes[0].set_title("TRAINING_DATA_ACCURACY")
axes[1].set_title("TESTING_DATA_ACCURACY")
plt.show()
PCA_TUNED = PCA(n_components=13)
PCA_X_TUNED = PCA_TUNED.fit_transform(X_STD)
PCA_TUNED_DF = pd.DataFrame(PCA_TUNED.components_,columns=list(X_STD))
COV_MX_TUNED = np.cov(X_STD.T)
EIGEN_VALUES_TUNED, EIGEN_VECTORS_TUNED = np.linalg.eig(COV_MX_TUNED)
PCA_OPT = np.array(EIGEN_VECTORS_FIN[0:13])
X_STD_13D = np.dot(X_STD,PCA_OPT.T)
PCA_DF_TUNED = pd.DataFrame(X_STD_13D)
X_TRAIN_13D, X_TEST_13D, Y_TRAIN_13D, Y_TEST_13D = train_test_split(PCA_DF_TUNED, Y, test_size = 0.3, random_state = 10)
SVM_MODEL = SVC(kernel='linear',C=1,gamma=.6)
SVM_MODEL.fit(X_TRAIN_13D, Y_TRAIN_13D)
PREDICTION = SVM_MODEL.predict(X_TEST_13D)
TRAIN_ACC = SVM_MODEL.score(X_TRAIN_13D,Y_TRAIN_13D)
TEST_ACC = SVM_MODEL.score(X_TEST_13D,Y_TEST_13D)
print('TRAINING DATA ACCURACY:',TRAIN_ACC)
print('TESTING DATA ACCURACY:',TEST_ACC)
TRAINING DATA ACCURACY: 0.9054054054054054 TESTING DATA ACCURACY: 0.8858267716535433
MODEL = SVC()
PARAMS_C = {'C': np.arange( 1, 100+1, 1 ).tolist(),
'gamma': np.arange( 0.0, 10.0+0.0, 0.1 ).tolist(),
'kernel': ['linear','rbf']}
RANDOM_SEARCH = RandomizedSearchCV(model, param_distributions=PARAMS_C,
n_iter=50, scoring='accuracy',n_jobs=-1,cv=5)
RANDOM_SEARCH.fit(X_TRAIN,Y_TRAIN)
RandomizedSearchCV(cv=5,
estimator=SVC(C=0.01, degree=2, gamma=0.1, kernel='poly'),
n_iter=50, n_jobs=-1,
param_distributions={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, ...],
'gamma': [0.0, 0.1, 0.2,
0.30000000000000004, 0.4, 0.5,
0.6000000000000001,
0.7000000000000001, 0.8, 0.9,
1.0, 1.1, 1.2000000000000002,
1.3, 1.4000000000000001, 1.5,
1.6, 1.7000000000000002, 1.8,
1.9000000000000001, 2.0, 2.1,
2.2, 2.3000000000000003,
2.4000000000000004, 2.5, 2.6,
2.7, 2.8000000000000003,
2.9000000000000004, ...],
'kernel': ['linear', 'rbf']},
scoring='accuracy')
RANDOM_SEARCH.best_params_
{'kernel': 'rbf', 'gamma': 0.1, 'C': 17}
* SINCE WE ARE USING RANDOMISED SEARCH CV FUNCTION TO FIND BEST HYPER PARAMETERS DO REMEMBER THAT:
* EVERY TIME WE RUN THE ABOVE CODE, EXPECT TO GET A NEW BEST HYPER PARAMETERS AT EVERY INSTANCE
* TO REDUCE THE COMPUTATION TIME AND OVER LOAD ON THE SYSTEM RESOURCES, I PREFERED TO USE THIS METHOD
OVER GRID SEARCH SINCE GRID SEARCH IS TAKING TOO MUCH OF TIME TO RUN.
SVM_BEST = SVC(kernel= 'rbf', gamma= 0.1, C= 17)
BEST_RESULTS = pd.DataFrame()
SVM_BEST.fit(X_TRAIN, Y_TRAIN)
PREDICTION = SVM_BEST.predict(X_TEST)
TRAIN_ACC = SVM_BEST.score(X_TRAIN,Y_TRAIN)
TEST_ACC = SVM_BEST.score(X_TEST,Y_TEST)
BEST_RESULTS = BEST_RESULTS.append(pd.Series(['BEFORE PCA',TRAIN_ACC,TEST_ACC]),ignore_index=True)
SVM_BEST.fit(X_TRAIN_6D, Y_TRAIN_6D)
PREDICTION = SVM_BEST.predict(X_TEST_6D)
TRAIN_ACC = SVM_BEST.score(X_TRAIN_6D,Y_TRAIN_6D)
TEST_ACC = SVM_BEST.score(X_TEST_6D,Y_TEST_6D)
BEST_RESULTS = BEST_RESULTS.append(pd.Series(['AFTER PCA WITH 6 DIMENSIONS',TRAIN_ACC,TEST_ACC]),ignore_index=True)
SVM_BEST.fit(X_TRAIN_13D, Y_TRAIN_13D)
PREDICTION = SVM_BEST.predict(X_TEST_13D)
TRAIN_ACC = SVM_BEST.score(X_TRAIN_13D,Y_TRAIN_13D)
TEST_ACC = SVM_BEST.score(X_TEST_13D,Y_TEST_13D)
BEST_RESULTS = BEST_RESULTS.append(pd.Series(['AFTER PCA WITH 13 DIMENSIONS',TRAIN_ACC,TEST_ACC]),ignore_index=True)
BEST_RESULTS.columns = ['BEFORE/AFTER PCA','TRAINING ACCURACY','TESTING ACCURACY']
BEST_RESULTS
| BEFORE/AFTER PCA | TRAINING ACCURACY | TESTING ACCURACY | |
|---|---|---|---|
| 0 | BEFORE PCA | 1.000000 | 0.984252 |
| 1 | AFTER PCA WITH 6 DIMENSIONS | 0.935811 | 0.921260 |
| 2 | AFTER PCA WITH 13 DIMENSIONS | 0.996622 | 0.944882 |
fig, axes = plt.subplots(1, 2, figsize=(18, 7), sharey=True)
sns.scatterplot(x=BEST_RESULTS['BEFORE/AFTER PCA'],y=BEST_RESULTS['TRAINING ACCURACY'],hue = BEST_RESULTS['BEFORE/AFTER PCA'],
ax=axes[0],palette="deep")
sns.scatterplot(x=BEST_RESULTS['BEFORE/AFTER PCA'],y=BEST_RESULTS['TESTING ACCURACY'],hue = BEST_RESULTS['BEFORE/AFTER PCA'],
ax=axes[1],palette="deep")
axes[0].set_title("TRAINING_DATA_ACCURACY")
axes[1].set_title("TESTING_DATA_ACCURACY")
plt.show()
* WE CAN EITHER USE GRID_SEARCHCV OR RANDOMISED_SEARCHCV FOR FINDING BEST HYPER PARAMETERS.
* I USED RANDOMISED_SEARCHCV TO FIND BEST PARAMETERS TO REDUCE COMPUTATION TIME OVER USING GRID_SEARCHCV.
* WE RE-TRAINED THE SVM MODEL WITH THE BEST HYPER PARAMETERS AND RESULTS WERE PUBLISHED.
* FROM THE ABOVE OUTPUT WE CAN DEDUCT BELOW FINDINGS:
* MODEL PERFORMANCE BEFORE PCA WITH BEST PARAMETERS IS AT MAX 100%
* ON REDUCING DIMENSIONS TO 6, MODEL STILL PERFORMED GREAT AT 93.5% ACCURACY
* WE CAN SEE THAT AT 13 DIMENSIONS, MODEL TRAINING DATA ACCURACY IS ALMOST NEAR TO 100%
* DO NOTE THAT FOR EVERY SET OF OUTPUT PARAMETERS FROM RANDOMISED SEARCH CV, WE GET VARIATION IN THE RESULTS
print('BEST PARAMETERS FROM HYPER TUNING:\n',RANDOM_SEARCH.best_params_)
BEST PARAMETERS FROM HYPER TUNING:
{'kernel': 'rbf', 'gamma': 0.1, 'C': 17}
* WE USED RANDOMISED_SEARCHCV TO FIND THE BEST PARAMETERS WHICH ARE AS BELOW:
KERNEL: RBF
GAMMA: 0.1
C: 17
ALSO WE COULD SEE THAT ON REDUCING TO 6 DIMENSIONS WE ARE STILL GETTING GOOD MODEL PERFORMANCE
ACCURACY IS REDUCED ON REDUCING THE DIMENSIONS BUT STILL WE ARE HAVING THE ACCURACY MORE THAN 90%
SVM_BEST.fit(X_TRAIN, Y_TRAIN)
PREDICTION = SVM_BEST.predict(X_TEST)
print('CLASSIFICATION REPORT BEFORE APPLYING PCA DIMENSION REDUCTION\n')
print(classification_report(Y_TEST,PREDICTION),'\n')
SVM_BEST.fit(X_TRAIN_6D, Y_TRAIN_6D)
PREDICTION = SVM_BEST.predict(X_TEST_6D)
print('CLASSIFICATION REPORT AFTER APPLYING PCA DIMENSION REDUCTION TO 6 DIMENSIONS\n')
print(classification_report(Y_TEST_6D,PREDICTION),'\n')
SVM_BEST.fit(X_TRAIN_13D, Y_TRAIN_13D)
PREDICTION = SVM_BEST.predict(X_TEST_13D)
print('CLASSIFICATION REPORT AFTER APPLYING PCA DIMENSION REDUCTION TO 13 DIMENSIONS\n')
print(classification_report(Y_TEST_13D,PREDICTION),'\n')
CLASSIFICATION REPORT BEFORE APPLYING PCA DIMENSION REDUCTION
precision recall f1-score support
0 1.00 1.00 1.00 71
1 0.97 1.00 0.98 125
2 1.00 0.93 0.96 58
accuracy 0.98 254
macro avg 0.99 0.98 0.98 254
weighted avg 0.98 0.98 0.98 254
CLASSIFICATION REPORT AFTER APPLYING PCA DIMENSION REDUCTION TO 6 DIMENSIONS
precision recall f1-score support
0 0.96 0.94 0.95 71
1 0.94 0.93 0.94 125
2 0.84 0.88 0.86 58
accuracy 0.92 254
macro avg 0.91 0.92 0.91 254
weighted avg 0.92 0.92 0.92 254
CLASSIFICATION REPORT AFTER APPLYING PCA DIMENSION REDUCTION TO 13 DIMENSIONS
precision recall f1-score support
0 0.99 0.94 0.96 71
1 0.94 0.95 0.95 125
2 0.90 0.93 0.92 58
accuracy 0.94 254
macro avg 0.94 0.94 0.94 254
weighted avg 0.95 0.94 0.95 254
* LOOKING AT THE ABOVE SCORES, WE CAN SEE THAT MODEL IS HAVING MORE THAN 90% ACCURACY ON ALL 3 SCENARIOS.
* ON RAW DATA, TRAINING ACCURACY IS HIGH AT 98%
* WE HAVE REDUCED THE DIMENSIONS TO 6 AND 13 USING PCA.
* AT 6 DIMENSIONS, ACCURACY IS STILL 92% SHOWING THE COMPONENTS ARE CORRELATED STRONGLY
* AT 13 DIMENSIONS, TRAINING ACCURACY IS 94% WHICH SHOWS THE INCREASE IN HIGHLY CORRELATED COMPONENTS.
* OVER ALL AFTER PCA DIMENSIONALITY REDUCTION, MODEL IS STILL PERFORMING VERY BEST.
WHAT IS PCA?
************
* PCA IS A UN-SUPERVISED LEARNING TECHNIQUE THAT HELPS IN REDUCING THE DIMENSIONALITY IN THE INPUT DATA SET.
* WHEN A DATA SET IS HAVING HIGHER DIMENSIONALITY, PCA TREATS EACH VARIABLE AS PRINCIPLE COMPONENT AND THEN
PAIRS WITH OTHER VARIABLE BY IDENTIFYING THE RELATIONSHIP BETWEEN THEM.
* BY THIS WE CAN SEE THAT THE FIRST PRINCIPLE COMPONENT WILL BE HAVING MORE VARIATION AND IT GETS DECREASED AS
WE MOVE DOWN TO LAST PRINCIPLE COMPONENTS WHICH HAVE LEAST VARIATION.
* AS WE REDUCE THE DIMENSIONS, WE COULD THE DECREASE IN ACCURACY OF THE MODEL.
* THIS DIMENSIONALITY REDUCTION ALSO ALLOWS TO TRAVERSE THROUGH MINIMUM COLUMNS AND VISUALIZE THEM EASILY
* THIS MAKES THE MACHINE LEARNING MODEL EASIER TO ANALYSE AND MAKES IT FASTER TO PROCESS.
* INORDER TO ACHIEVE THIS WE NEED TO HAVE SOME ASSUMPTIONS ON THE DATA SET WHICH WILL BE DISCUSSED BELOW.
PRE-REQUISITE / ASSUMPTIONS OF PCA:
***********************************
THERE ARE SOME ASSUMPTIONS TO BE MADE THAT MAKES THE DIMENSIONALITY REDUCTION EASIER:
1. DATA SET IS LINEAR: ALL THE VARIABLES IN THE DATASET ARE IN LINEAR MANNER AND CONTAINS RELATIONSHIPS AMONG
THEM SELVES.
2. INDEPENDENT VARIABLES IN THE DATA SET ARE HIGHLY CORRELATED TO EACH OTHER AND REDUCED FEATURE SET REPRESENTS
THE ORIGINAL DATA SET IN EFFECTIVE MANNER.
3. DATA SET CONTAINS VERY LESS NUMBER OF OUTLIERS WHICH DEVIATES FROM MOST OF THE DATA POINTS AND HAVING MORE
OUTLIERS MEANS THAT ERRORS ARE MORE AND REDUCES MODEL OVERALL PERFORMANCE.
4. ALL THE FEATURES ARE LOW DIMENSIONAL AND NUMERIC IN NATURE.
5. PRINCIPLE COMPONENTS WITH HIGHER VARIANCE ARE GIVEN UTMOST IMPORTANCE WHERE AS PRINCIPLE COMPONENTS WITH
LOWER VARIANCE ARE TREATED AS NOISE.
ADVANTAGES OF PCA:
******************
* DIMENSIONALITY REDUCTION HELPS IN VISUALIZING THE DATA SET EASIER DUE TO LESS FEATURES THAT MAKES IT EASIER
* HELPS IN FINDING THE MOST IMPORTANT FEATURES OR VARIABLES THAT ARE NOT CO-RELATED
* INCREASES THE SPEED OF MACHINE LEARNING ALGORITHM AS THERE ARE LESS FEATURES TO ANALYZE. IN AN OVERALL PCA
INCREASES OVERALL ALGORITHM PERFORMANCE.
* REDUCES THE OVER FITTING OF DATA. MORE VARIABLES CAUSES OVER FITTING OF DATA AND AS PCA REDUCES DIMENSIONALITY
WE HAVE LESS FEATURES OR ONLY IMPORTANT FEATURES THAT REDUCES OVER FITTING OF THE DATA.
* SIMPLIFY THE COMPLEX BUSINESS PROBLEMS AS IN PCA WE NEED TO TRAIN THE MODEL ONLY ON THE PRINCIPAL COMPONENTS
THAT REDUCES THE SIZE OF THE VARIABLES THAT NEEDS TO BE ANALYSED EARLIER.
LIMITATIONS OF PCA:
*******************
* PCA RELIES ON GIVING PRIORITY TO VARIABLES THAT HAVE MORE VARIATION ON THE DATASET AND HENCE WE ARE
LOSING SOME FEATURES WHEN TRAINING THE MODEL USING PCA, THUS LEADING TO INFORMATION LOSS.
* BEFORE APPLYING PCA, WE MUST STANDARDISE THE DATA SET, IF WE DO NOT STANDARDISE THE DATA, IT BECOMES DIFFICULT
FOR PCA TO FIND OUT THE IMPORTANT FEATURES.
* THE STANDARDISATION HAS TO BE DONE ON ALL THE FEATURES BEFORE APPLYING PCA BUT ON LATER STAGE WE TENDS TO
USE ONLY COLUMNS THAT HAS HIGH VARIATION ON THE DATA.
* SINCE WE ARE STANDARSING THE DATA AND USING THE PRINCIPLE COMPONENTS, WE CANNOT INTERPRET THE ORIGINAL
VARIABLES IN THE DATA SET. ALSO NOTE THAT THESE PRINCIPAL COMPONENTS ARE NOT AS READABLE AS ORIGINAL DATA
* WE ARE ASSUMING THE DATA SET TO BE LINEAR AND HENCE IT IS NOT SUITABLE TO CAPTURE THE NON-LINEAR DATA SET.
* HAVING MORE OUTLIERS IN THE DATA CAN MAKE THE PCA TO IDENTIFY THE PRINCIPAL COMPONENTS DIFFICULT.
********** END OF UNSUPERVISED LEARNING ASSESMENT **********